diff --git a/apps/chatgpt_data/__init__.py b/apps/chatgpt_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/apps/chatgpt_data/chatgpt_reader.py b/apps/chatgpt_data/chatgpt_reader.py new file mode 100644 index 0000000..dff964f --- /dev/null +++ b/apps/chatgpt_data/chatgpt_reader.py @@ -0,0 +1,413 @@ +""" +ChatGPT export data reader. + +Reads and processes ChatGPT export data from chat.html files. +""" + +import re +from pathlib import Path +from typing import Any +from zipfile import ZipFile + +from bs4 import BeautifulSoup +from llama_index.core import Document +from llama_index.core.readers.base import BaseReader + + +class ChatGPTReader(BaseReader): + """ + ChatGPT export data reader. + + Reads ChatGPT conversation data from exported chat.html files or zip archives. + Processes conversations into structured documents with metadata. + """ + + def __init__(self, concatenate_conversations: bool = True) -> None: + """ + Initialize. + + Args: + concatenate_conversations: Whether to concatenate messages within conversations for better context + """ + try: + from bs4 import BeautifulSoup # noqa + except ImportError: + raise ImportError("`beautifulsoup4` package not found: `pip install beautifulsoup4`") + + self.concatenate_conversations = concatenate_conversations + + def _extract_html_from_zip(self, zip_path: Path) -> str | None: + """ + Extract chat.html from ChatGPT export zip file. + + Args: + zip_path: Path to the ChatGPT export zip file + + Returns: + HTML content as string, or None if not found + """ + try: + with ZipFile(zip_path, "r") as zip_file: + # Look for chat.html or conversations.html + html_files = [ + f + for f in zip_file.namelist() + if f.endswith(".html") and ("chat" in f.lower() or "conversation" in f.lower()) + ] + + if not html_files: + print(f"No HTML chat file found in {zip_path}") + return None + + # Use the first HTML file found + html_file = html_files[0] + print(f"Found HTML file: {html_file}") + + with zip_file.open(html_file) as f: + return f.read().decode("utf-8", errors="ignore") + + except Exception as e: + print(f"Error extracting HTML from zip {zip_path}: {e}") + return None + + def _parse_chatgpt_html(self, html_content: str) -> list[dict]: + """ + Parse ChatGPT HTML export to extract conversations. 
+ + Args: + html_content: HTML content from ChatGPT export + + Returns: + List of conversation dictionaries + """ + soup = BeautifulSoup(html_content, "html.parser") + conversations = [] + + # Try different possible structures for ChatGPT exports + # Structure 1: Look for conversation containers + conversation_containers = soup.find_all( + ["div", "section"], class_=re.compile(r"conversation|chat", re.I) + ) + + if not conversation_containers: + # Structure 2: Look for message containers directly + conversation_containers = [soup] # Use the entire document as one conversation + + for container in conversation_containers: + conversation = self._extract_conversation_from_container(container) + if conversation and conversation.get("messages"): + conversations.append(conversation) + + # If no structured conversations found, try to extract all text as one conversation + if not conversations: + all_text = soup.get_text(separator="\n", strip=True) + if all_text: + conversations.append( + { + "title": "ChatGPT Conversation", + "messages": [{"role": "mixed", "content": all_text, "timestamp": None}], + "timestamp": None, + } + ) + + return conversations + + def _extract_conversation_from_container(self, container) -> dict | None: + """ + Extract conversation data from a container element. + + Args: + container: BeautifulSoup element containing conversation + + Returns: + Dictionary with conversation data or None + """ + messages = [] + + # Look for message elements with various possible structures + message_selectors = ['[class*="message"]', '[class*="chat"]', "[data-message]", "p", "div"] + + for selector in message_selectors: + message_elements = container.select(selector) + if message_elements: + break + else: + message_elements = [] + + # If no structured messages found, treat the entire container as one message + if not message_elements: + text_content = container.get_text(separator="\n", strip=True) + if text_content: + messages.append({"role": "mixed", "content": text_content, "timestamp": None}) + else: + for element in message_elements: + message = self._extract_message_from_element(element) + if message: + messages.append(message) + + if not messages: + return None + + # Try to extract conversation title + title_element = container.find(["h1", "h2", "h3", "title"]) + title = title_element.get_text(strip=True) if title_element else "ChatGPT Conversation" + + # Try to extract timestamp from various possible locations + timestamp = self._extract_timestamp_from_container(container) + + return {"title": title, "messages": messages, "timestamp": timestamp} + + def _extract_message_from_element(self, element) -> dict | None: + """ + Extract message data from an element. 
+ + Args: + element: BeautifulSoup element containing message + + Returns: + Dictionary with message data or None + """ + text_content = element.get_text(separator=" ", strip=True) + + # Skip empty or very short messages + if not text_content or len(text_content.strip()) < 3: + return None + + # Try to determine role (user/assistant) from class names or content + role = "mixed" # Default role + + class_names = " ".join(element.get("class", [])).lower() + if "user" in class_names or "human" in class_names: + role = "user" + elif "assistant" in class_names or "ai" in class_names or "gpt" in class_names: + role = "assistant" + elif text_content.lower().startswith(("you:", "user:", "me:")): + role = "user" + text_content = re.sub(r"^(you|user|me):\s*", "", text_content, flags=re.IGNORECASE) + elif text_content.lower().startswith(("chatgpt:", "assistant:", "ai:")): + role = "assistant" + text_content = re.sub( + r"^(chatgpt|assistant|ai):\s*", "", text_content, flags=re.IGNORECASE + ) + + # Try to extract timestamp + timestamp = self._extract_timestamp_from_element(element) + + return {"role": role, "content": text_content, "timestamp": timestamp} + + def _extract_timestamp_from_element(self, element) -> str | None: + """Extract timestamp from element.""" + # Look for timestamp in various attributes and child elements + timestamp_attrs = ["data-timestamp", "timestamp", "datetime"] + for attr in timestamp_attrs: + if element.get(attr): + return element.get(attr) + + # Look for time elements + time_element = element.find("time") + if time_element: + return time_element.get("datetime") or time_element.get_text(strip=True) + + # Look for date-like text patterns + text = element.get_text() + date_patterns = [r"\d{4}-\d{2}-\d{2}", r"\d{1,2}/\d{1,2}/\d{4}", r"\w+ \d{1,2}, \d{4}"] + + for pattern in date_patterns: + match = re.search(pattern, text) + if match: + return match.group() + + return None + + def _extract_timestamp_from_container(self, container) -> str | None: + """Extract timestamp from conversation container.""" + return self._extract_timestamp_from_element(container) + + def _create_concatenated_content(self, conversation: dict) -> str: + """ + Create concatenated content from conversation messages. + + Args: + conversation: Dictionary containing conversation data + + Returns: + Formatted concatenated content + """ + title = conversation.get("title", "ChatGPT Conversation") + messages = conversation.get("messages", []) + timestamp = conversation.get("timestamp", "Unknown") + + # Build message content + message_parts = [] + for i, message in enumerate(messages): + role = message.get("role", "mixed") + content = message.get("content", "") + msg_timestamp = message.get("timestamp", "") + + if role == "user": + prefix = "[You]" + elif role == "assistant": + prefix = "[ChatGPT]" + else: + prefix = "[Message]" + + # Add timestamp if available + if msg_timestamp: + prefix += f" ({msg_timestamp})" + + message_parts.append(f"{prefix}: {content}") + + concatenated_text = "\n\n".join(message_parts) + + # Create final document content + doc_content = f"""Conversation: {title} +Date: {timestamp} +Messages ({len(messages)} messages): + +{concatenated_text} +""" + return doc_content + + def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]: + """ + Load ChatGPT export data. 
+ + Args: + input_dir: Directory containing ChatGPT export files or path to specific file + **load_kwargs: + max_count (int): Maximum number of conversations to process + chatgpt_export_path (str): Specific path to ChatGPT export file/directory + include_metadata (bool): Whether to include metadata in documents + """ + docs: list[Document] = [] + max_count = load_kwargs.get("max_count", -1) + chatgpt_export_path = load_kwargs.get("chatgpt_export_path", input_dir) + include_metadata = load_kwargs.get("include_metadata", True) + + if not chatgpt_export_path: + print("No ChatGPT export path provided") + return docs + + export_path = Path(chatgpt_export_path) + + if not export_path.exists(): + print(f"ChatGPT export path not found: {export_path}") + return docs + + html_content = None + + # Handle different input types + if export_path.is_file(): + if export_path.suffix.lower() == ".zip": + # Extract HTML from zip file + html_content = self._extract_html_from_zip(export_path) + elif export_path.suffix.lower() == ".html": + # Read HTML file directly + try: + with open(export_path, encoding="utf-8", errors="ignore") as f: + html_content = f.read() + except Exception as e: + print(f"Error reading HTML file {export_path}: {e}") + return docs + else: + print(f"Unsupported file type: {export_path.suffix}") + return docs + + elif export_path.is_dir(): + # Look for HTML files in directory + html_files = list(export_path.glob("*.html")) + zip_files = list(export_path.glob("*.zip")) + + if html_files: + # Use first HTML file found + html_file = html_files[0] + print(f"Found HTML file: {html_file}") + try: + with open(html_file, encoding="utf-8", errors="ignore") as f: + html_content = f.read() + except Exception as e: + print(f"Error reading HTML file {html_file}: {e}") + return docs + + elif zip_files: + # Use first zip file found + zip_file = zip_files[0] + print(f"Found zip file: {zip_file}") + html_content = self._extract_html_from_zip(zip_file) + + else: + print(f"No HTML or zip files found in {export_path}") + return docs + + if not html_content: + print("No HTML content found to process") + return docs + + # Parse conversations from HTML + print("Parsing ChatGPT conversations from HTML...") + conversations = self._parse_chatgpt_html(html_content) + + if not conversations: + print("No conversations found in HTML content") + return docs + + print(f"Found {len(conversations)} conversations") + + # Process conversations into documents + count = 0 + for conversation in conversations: + if max_count > 0 and count >= max_count: + break + + if self.concatenate_conversations: + # Create one document per conversation with concatenated messages + doc_content = self._create_concatenated_content(conversation) + + metadata = {} + if include_metadata: + metadata = { + "title": conversation.get("title", "ChatGPT Conversation"), + "timestamp": conversation.get("timestamp", "Unknown"), + "message_count": len(conversation.get("messages", [])), + "source": "ChatGPT Export", + } + + doc = Document(text=doc_content, metadata=metadata) + docs.append(doc) + count += 1 + + else: + # Create separate documents for each message + for message in conversation.get("messages", []): + if max_count > 0 and count >= max_count: + break + + role = message.get("role", "mixed") + content = message.get("content", "") + msg_timestamp = message.get("timestamp", "") + + if not content.strip(): + continue + + # Create document content with context + doc_content = f"""Conversation: {conversation.get("title", "ChatGPT Conversation")} +Role: 
{role} +Timestamp: {msg_timestamp or conversation.get("timestamp", "Unknown")} +Message: {content} +""" + + metadata = {} + if include_metadata: + metadata = { + "conversation_title": conversation.get("title", "ChatGPT Conversation"), + "role": role, + "timestamp": msg_timestamp or conversation.get("timestamp", "Unknown"), + "source": "ChatGPT Export", + } + + doc = Document(text=doc_content, metadata=metadata) + docs.append(doc) + count += 1 + + print(f"Created {len(docs)} documents from ChatGPT export") + return docs diff --git a/apps/chatgpt_rag.py b/apps/chatgpt_rag.py new file mode 100644 index 0000000..3c92d04 --- /dev/null +++ b/apps/chatgpt_rag.py @@ -0,0 +1,186 @@ +""" +ChatGPT RAG example using the unified interface. +Supports ChatGPT export data from chat.html files. +""" + +import sys +from pathlib import Path + +# Add parent directory to path for imports +sys.path.insert(0, str(Path(__file__).parent)) + +from base_rag_example import BaseRAGExample +from chunking import create_text_chunks + +from .chatgpt_data.chatgpt_reader import ChatGPTReader + + +class ChatGPTRAG(BaseRAGExample): + """RAG example for ChatGPT conversation data.""" + + def __init__(self): + # Set default values BEFORE calling super().__init__ + self.max_items_default = -1 # Process all conversations by default + self.embedding_model_default = ( + "sentence-transformers/all-MiniLM-L6-v2" # Fast 384-dim model + ) + + super().__init__( + name="ChatGPT", + description="Process and query ChatGPT conversation exports with LEANN", + default_index_name="chatgpt_conversations_index", + ) + + def _add_specific_arguments(self, parser): + """Add ChatGPT-specific arguments.""" + chatgpt_group = parser.add_argument_group("ChatGPT Parameters") + chatgpt_group.add_argument( + "--export-path", + type=str, + default="./chatgpt_export", + help="Path to ChatGPT export file (.zip or .html) or directory containing exports (default: ./chatgpt_export)", + ) + chatgpt_group.add_argument( + "--concatenate-conversations", + action="store_true", + default=True, + help="Concatenate messages within conversations for better context (default: True)", + ) + chatgpt_group.add_argument( + "--separate-messages", + action="store_true", + help="Process each message as a separate document (overrides --concatenate-conversations)", + ) + chatgpt_group.add_argument( + "--chunk-size", type=int, default=512, help="Text chunk size (default: 512)" + ) + chatgpt_group.add_argument( + "--chunk-overlap", type=int, default=128, help="Text chunk overlap (default: 128)" + ) + + def _find_chatgpt_exports(self, export_path: Path) -> list[Path]: + """ + Find ChatGPT export files in the given path. + + Args: + export_path: Path to search for exports + + Returns: + List of paths to ChatGPT export files + """ + export_files = [] + + if export_path.is_file(): + if export_path.suffix.lower() in [".zip", ".html"]: + export_files.append(export_path) + elif export_path.is_dir(): + # Look for zip and html files + export_files.extend(export_path.glob("*.zip")) + export_files.extend(export_path.glob("*.html")) + + return export_files + + async def load_data(self, args) -> list[str]: + """Load ChatGPT export data and convert to text chunks.""" + export_path = Path(args.export_path) + + if not export_path.exists(): + print(f"ChatGPT export path not found: {export_path}") + print( + "Please ensure you have exported your ChatGPT data and placed it in the correct location." + ) + print("\nTo export your ChatGPT data:") + print("1. Sign in to ChatGPT") + print("2. 
Click on your profile icon → Settings → Data Controls") + print("3. Click 'Export' under Export Data") + print("4. Download the zip file from the email link") + print("5. Extract or place the file/directory at the specified path") + return [] + + # Find export files + export_files = self._find_chatgpt_exports(export_path) + + if not export_files: + print(f"No ChatGPT export files (.zip or .html) found in: {export_path}") + return [] + + print(f"Found {len(export_files)} ChatGPT export files") + + # Create reader with appropriate settings + concatenate = args.concatenate_conversations and not args.separate_messages + reader = ChatGPTReader(concatenate_conversations=concatenate) + + # Process each export file + all_documents = [] + total_processed = 0 + + for i, export_file in enumerate(export_files): + print(f"\nProcessing export file {i + 1}/{len(export_files)}: {export_file.name}") + + try: + # Apply max_items limit per file + max_per_file = -1 + if args.max_items > 0: + remaining = args.max_items - total_processed + if remaining <= 0: + break + max_per_file = remaining + + # Load conversations + documents = reader.load_data( + chatgpt_export_path=str(export_file), + max_count=max_per_file, + include_metadata=True, + ) + + if documents: + all_documents.extend(documents) + total_processed += len(documents) + print(f"Processed {len(documents)} conversations from this file") + else: + print(f"No conversations loaded from {export_file}") + + except Exception as e: + print(f"Error processing {export_file}: {e}") + continue + + if not all_documents: + print("No conversations found to process!") + print("\nTroubleshooting:") + print("- Ensure the export file is a valid ChatGPT export") + print("- Check that the HTML file contains conversation data") + print("- Try extracting the zip file and pointing to the HTML file directly") + return [] + + print(f"\nTotal conversations processed: {len(all_documents)}") + print("Now starting to split into text chunks... this may take some time") + + # Convert to text chunks + all_texts = create_text_chunks( + all_documents, chunk_size=args.chunk_size, chunk_overlap=args.chunk_overlap + ) + + print(f"Created {len(all_texts)} text chunks from {len(all_documents)} conversations") + return all_texts + + +if __name__ == "__main__": + import asyncio + + # Example queries for ChatGPT RAG + print("\nšŸ¤– ChatGPT RAG Example") + print("=" * 50) + print("\nExample queries you can try:") + print("- 'What did I ask about Python programming?'") + print("- 'Show me conversations about machine learning'") + print("- 'Find discussions about travel planning'") + print("- 'What advice did ChatGPT give me about career development?'") + print("- 'Search for conversations about cooking recipes'") + print("\nTo get started:") + print("1. Export your ChatGPT data from Settings → Data Controls → Export") + print("2. Place the downloaded zip file or extracted HTML in ./chatgpt_export/") + print("3. Run this script to build your personal ChatGPT knowledge base!") + print("\nOr run without --query for interactive mode\n") + + rag = ChatGPTRAG() + asyncio.run(rag.run())
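
Reviewer note: a minimal standalone sketch (not part of the diff) of how the new reader is meant to be driven, based on the ChatGPTReader.load_data signature added above. The import assumes apps/ is on sys.path or is the working directory (matching the sys.path insert in chatgpt_rag.py), and ./chatgpt_export/chat.html is a placeholder path.

    from chatgpt_data.chatgpt_reader import ChatGPTReader

    # One Document per conversation, with messages concatenated for retrieval context.
    reader = ChatGPTReader(concatenate_conversations=True)

    # load_data accepts a ChatGPT export .zip, an extracted chat.html, or a directory
    # containing either; max_count=-1 means no limit. The path below is a placeholder.
    docs = reader.load_data(
        chatgpt_export_path="./chatgpt_export/chat.html",
        max_count=-1,
        include_metadata=True,
    )

    # With concatenate_conversations=True, each Document carries title, timestamp,
    # message_count, and source in its metadata.
    for doc in docs[:3]:
        print(doc.metadata.get("title"), doc.metadata.get("message_count"))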
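
End to end, the same flow runs through the new CLI entry point, e.g. python apps/chatgpt_rag.py --export-path ./chatgpt_export --chunk-size 512 --chunk-overlap 128, with --query for a one-shot question or omitted for interactive mode as noted in the banner above; the script additionally splits the loaded conversations with create_text_chunks before indexing.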