"""
|
|
ChatGPT export data reader.
|
|
|
|
Reads and processes ChatGPT export data from chat.html files.
|
|
"""
|
|
|
|
import re
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from zipfile import ZipFile
|
|
|
|
from bs4 import BeautifulSoup
|
|
from llama_index.core import Document
|
|
from llama_index.core.readers.base import BaseReader
|
|
|
|
|
|
class ChatGPTReader(BaseReader):
    """
    ChatGPT export data reader.

    Reads ChatGPT conversation data from exported chat.html files or zip archives.
    Processes conversations into structured documents with metadata.
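
    Example (a minimal usage sketch; the export file name is a placeholder):

        reader = ChatGPTReader()
        docs = reader.load_data("exports/chatgpt-export.zip")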
"""

    def __init__(self, concatenate_conversations: bool = True) -> None:
        """
        Initialize.

        Args:
            concatenate_conversations: Whether to concatenate messages within
                conversations for better context
        """
        try:
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError as e:
            raise ImportError(
                "`beautifulsoup4` package not found: `pip install beautifulsoup4`"
            ) from e

        self.concatenate_conversations = concatenate_conversations

    def _extract_html_from_zip(self, zip_path: Path) -> str | None:
        """
        Extract chat.html from ChatGPT export zip file.

        Args:
            zip_path: Path to the ChatGPT export zip file

        Returns:
            HTML content as a string, or None if not found
        """
        try:
            with ZipFile(zip_path, "r") as zip_file:
                # Look for chat.html or conversations.html
                html_files = [
                    f
                    for f in zip_file.namelist()
                    if f.endswith(".html") and ("chat" in f.lower() or "conversation" in f.lower())
                ]

                if not html_files:
                    print(f"No HTML chat file found in {zip_path}")
                    return None

                # Use the first HTML file found
                html_file = html_files[0]
                print(f"Found HTML file: {html_file}")

                with zip_file.open(html_file) as f:
                    return f.read().decode("utf-8", errors="ignore")

        except Exception as e:
            print(f"Error extracting HTML from zip {zip_path}: {e}")
            return None

    def _parse_chatgpt_html(self, html_content: str) -> list[dict]:
        """
        Parse ChatGPT HTML export to extract conversations.

        Args:
            html_content: HTML content from ChatGPT export

        Returns:
            List of conversation dictionaries
        """
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(html_content, "html.parser")
        conversations = []

        # Try different possible structures for ChatGPT exports
        # Structure 1: Look for conversation containers
        conversation_containers = soup.find_all(
            ["div", "section"], class_=re.compile(r"conversation|chat", re.I)
        )

        if not conversation_containers:
            # Structure 2: Look for message containers directly
            conversation_containers = [soup]  # Use the entire document as one conversation

        for container in conversation_containers:
            conversation = self._extract_conversation_from_container(container)
            if conversation and conversation.get("messages"):
                conversations.append(conversation)

        # If no structured conversations found, try to extract all text as one conversation
        if not conversations:
            all_text = soup.get_text(separator="\n", strip=True)
            if all_text:
                conversations.append(
                    {
                        "title": "ChatGPT Conversation",
                        "messages": [{"role": "mixed", "content": all_text, "timestamp": None}],
                        "timestamp": None,
                    }
                )

        return conversations

    def _extract_conversation_from_container(self, container) -> dict | None:
        """
        Extract conversation data from a container element.

        Args:
            container: BeautifulSoup element containing conversation

        Returns:
            Dictionary with conversation data or None
        """
        messages = []

        # Look for message elements with various possible structures
        message_selectors = ['[class*="message"]', '[class*="chat"]', "[data-message]", "p", "div"]

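        # Stop at the first selector that matches anything; the for/else below
        # leaves message_elements empty only when no selector matched at all.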
        for selector in message_selectors:
            message_elements = container.select(selector)
            if message_elements:
                break
        else:
            message_elements = []

        # If no structured messages found, treat the entire container as one message
        if not message_elements:
            text_content = container.get_text(separator="\n", strip=True)
            if text_content:
                messages.append({"role": "mixed", "content": text_content, "timestamp": None})
        else:
            for element in message_elements:
                message = self._extract_message_from_element(element)
                if message:
                    messages.append(message)

        if not messages:
            return None

        # Try to extract conversation title
        title_element = container.find(["h1", "h2", "h3", "title"])
        title = title_element.get_text(strip=True) if title_element else "ChatGPT Conversation"

        # Try to extract timestamp from various possible locations
        timestamp = self._extract_timestamp_from_container(container)

        return {"title": title, "messages": messages, "timestamp": timestamp}

    def _extract_message_from_element(self, element) -> dict | None:
        """
        Extract message data from an element.

        Args:
            element: BeautifulSoup element containing message

        Returns:
            Dictionary with message data or None
        """
        text_content = element.get_text(separator=" ", strip=True)

        # Skip empty or very short messages
        if not text_content or len(text_content.strip()) < 3:
            return None

        # Try to determine role (user/assistant) from class names or content
        role = "mixed"  # Default role

        class_names = " ".join(element.get("class", [])).lower()
        if "user" in class_names or "human" in class_names:
            role = "user"
        elif "assistant" in class_names or "ai" in class_names or "gpt" in class_names:
            role = "assistant"
        elif text_content.lower().startswith(("you:", "user:", "me:")):
            role = "user"
            text_content = re.sub(r"^(you|user|me):\s*", "", text_content, flags=re.IGNORECASE)
        elif text_content.lower().startswith(("chatgpt:", "assistant:", "ai:")):
            role = "assistant"
            text_content = re.sub(
                r"^(chatgpt|assistant|ai):\s*", "", text_content, flags=re.IGNORECASE
            )

        # Try to extract timestamp
        timestamp = self._extract_timestamp_from_element(element)

        return {"role": role, "content": text_content, "timestamp": timestamp}

    def _extract_timestamp_from_element(self, element) -> str | None:
        """Extract timestamp from element."""
        # Look for timestamp in various attributes and child elements
        timestamp_attrs = ["data-timestamp", "timestamp", "datetime"]
        for attr in timestamp_attrs:
            if element.get(attr):
                return element.get(attr)

        # Look for time elements
        time_element = element.find("time")
        if time_element:
            return time_element.get("datetime") or time_element.get_text(strip=True)

        # Look for date-like text patterns
        text = element.get_text()
        date_patterns = [r"\d{4}-\d{2}-\d{2}", r"\d{1,2}/\d{1,2}/\d{4}", r"\w+ \d{1,2}, \d{4}"]

        for pattern in date_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group()

        return None

    def _extract_timestamp_from_container(self, container) -> str | None:
        """Extract timestamp from conversation container."""
        return self._extract_timestamp_from_element(container)

    def _create_concatenated_content(self, conversation: dict) -> str:
        """
        Create concatenated content from conversation messages.

        Args:
            conversation: Dictionary containing conversation data

        Returns:
            Formatted concatenated content
        """
        title = conversation.get("title", "ChatGPT Conversation")
        messages = conversation.get("messages", [])
        timestamp = conversation.get("timestamp", "Unknown")

        # Build message content
        message_parts = []
        for message in messages:
            role = message.get("role", "mixed")
            content = message.get("content", "")
            msg_timestamp = message.get("timestamp", "")

            if role == "user":
                prefix = "[You]"
            elif role == "assistant":
                prefix = "[ChatGPT]"
            else:
                prefix = "[Message]"

            # Add timestamp if available
            if msg_timestamp:
                prefix += f" ({msg_timestamp})"

            message_parts.append(f"{prefix}: {content}")

        concatenated_text = "\n\n".join(message_parts)

        # Create final document content
        doc_content = f"""Conversation: {title}
Date: {timestamp}
Messages ({len(messages)}):

{concatenated_text}
"""
        return doc_content

    def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
        """
        Load ChatGPT export data.

        Args:
            input_dir: Directory containing ChatGPT export files or path to specific file
            **load_kwargs:
                max_count (int): Maximum number of conversations to process
                chatgpt_export_path (str): Specific path to ChatGPT export file/directory
                include_metadata (bool): Whether to include metadata in documents

        Returns:
            List of Document objects created from the export
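
        Example (an illustrative sketch; the directory path is a placeholder):

            reader = ChatGPTReader(concatenate_conversations=False)
            docs = reader.load_data("exports/", max_count=100)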
"""
        docs: list[Document] = []
        max_count = load_kwargs.get("max_count", -1)
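        # Note: a max_count of -1 (the default) means no limit; the loops
        # below only enforce it when max_count > 0.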
        chatgpt_export_path = load_kwargs.get("chatgpt_export_path", input_dir)
        include_metadata = load_kwargs.get("include_metadata", True)

        if not chatgpt_export_path:
            print("No ChatGPT export path provided")
            return docs

        export_path = Path(chatgpt_export_path)

        if not export_path.exists():
            print(f"ChatGPT export path not found: {export_path}")
            return docs

        html_content = None

        # Handle different input types
        if export_path.is_file():
            if export_path.suffix.lower() == ".zip":
                # Extract HTML from zip file
                html_content = self._extract_html_from_zip(export_path)
            elif export_path.suffix.lower() == ".html":
                # Read HTML file directly
                try:
                    with open(export_path, encoding="utf-8", errors="ignore") as f:
                        html_content = f.read()
                except Exception as e:
                    print(f"Error reading HTML file {export_path}: {e}")
                    return docs
            else:
                print(f"Unsupported file type: {export_path.suffix}")
                return docs

        elif export_path.is_dir():
            # Look for HTML files in directory
            html_files = list(export_path.glob("*.html"))
            zip_files = list(export_path.glob("*.zip"))

            if html_files:
                # Use first HTML file found
                html_file = html_files[0]
                print(f"Found HTML file: {html_file}")
                try:
                    with open(html_file, encoding="utf-8", errors="ignore") as f:
                        html_content = f.read()
                except Exception as e:
                    print(f"Error reading HTML file {html_file}: {e}")
                    return docs

            elif zip_files:
                # Use first zip file found
                zip_file = zip_files[0]
                print(f"Found zip file: {zip_file}")
                html_content = self._extract_html_from_zip(zip_file)

            else:
                print(f"No HTML or zip files found in {export_path}")
                return docs

        if not html_content:
            print("No HTML content found to process")
            return docs

        # Parse conversations from HTML
        print("Parsing ChatGPT conversations from HTML...")
        conversations = self._parse_chatgpt_html(html_content)

        if not conversations:
            print("No conversations found in HTML content")
            return docs

        print(f"Found {len(conversations)} conversations")

        # Process conversations into documents
        count = 0
        for conversation in conversations:
            if max_count > 0 and count >= max_count:
                break

            if self.concatenate_conversations:
                # Create one document per conversation with concatenated messages
                doc_content = self._create_concatenated_content(conversation)

                metadata = {}
                if include_metadata:
                    metadata = {
                        "title": conversation.get("title", "ChatGPT Conversation"),
                        "timestamp": conversation.get("timestamp", "Unknown"),
                        "message_count": len(conversation.get("messages", [])),
                        "source": "ChatGPT Export",
                    }

                doc = Document(text=doc_content, metadata=metadata)
                docs.append(doc)
                count += 1

            else:
                # Create a separate document for each message
                for message in conversation.get("messages", []):
                    if max_count > 0 and count >= max_count:
                        break

                    role = message.get("role", "mixed")
                    content = message.get("content", "")
                    msg_timestamp = message.get("timestamp", "")

                    if not content.strip():
                        continue

                    # Create document content with context
                    doc_content = f"""Conversation: {conversation.get('title', 'ChatGPT Conversation')}
Role: {role}
Timestamp: {msg_timestamp or conversation.get('timestamp', 'Unknown')}
Message: {content}
"""

                    metadata = {}
                    if include_metadata:
                        metadata = {
                            "conversation_title": conversation.get("title", "ChatGPT Conversation"),
                            "role": role,
                            "timestamp": msg_timestamp or conversation.get("timestamp", "Unknown"),
                            "source": "ChatGPT Export",
                        }

                    doc = Document(text=doc_content, metadata=metadata)
                    docs.append(doc)
                    count += 1

        print(f"Created {len(docs)} documents from ChatGPT export")
        return docs
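

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the reader's API). The export
    # path below is a placeholder; point it at a real ChatGPT export zip,
    # a chat.html file, or a directory containing either.
    reader = ChatGPTReader(concatenate_conversations=True)
    documents = reader.load_data(chatgpt_export_path="exports/chatgpt-export.zip")
    for doc in documents[:3]:
        print(doc.metadata)
        print(doc.text[:200])
        print("-" * 40)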