LEANN/apps/claude_data/claude_reader.py

"""
Claude export data reader.

Reads and processes Claude conversation data from exported JSON files.
"""

import json
from pathlib import Path
from typing import Any
from zipfile import ZipFile

from llama_index.core import Document
from llama_index.core.readers.base import BaseReader


class ClaudeReader(BaseReader):
    """
    Claude export data reader.

    Reads Claude conversation data from exported JSON files or zip archives.
    Processes conversations into structured documents with metadata.
    """

    def __init__(self, concatenate_conversations: bool = True) -> None:
        """
        Initialize.

        Args:
            concatenate_conversations: Whether to concatenate messages within conversations for better context
        """
        self.concatenate_conversations = concatenate_conversations

    def _extract_json_from_zip(self, zip_path: Path) -> list[str]:
        """
        Extract JSON files from Claude export zip file.

        Args:
            zip_path: Path to the Claude export zip file

        Returns:
            List of JSON content strings, or empty list if not found
        """
        json_contents = []
        try:
            with ZipFile(zip_path, "r") as zip_file:
                # Look for JSON files
                json_files = [f for f in zip_file.namelist() if f.endswith(".json")]

                if not json_files:
                    print(f"No JSON files found in {zip_path}")
                    return []

                print(f"Found {len(json_files)} JSON files in archive")

                for json_file in json_files:
                    with zip_file.open(json_file) as f:
                        content = f.read().decode("utf-8", errors="ignore")
                        json_contents.append(content)

        except Exception as e:
            print(f"Error extracting JSON from zip {zip_path}: {e}")

        return json_contents

    def _parse_claude_json(self, json_content: str) -> list[dict]:
        """
        Parse Claude JSON export to extract conversations.

        Args:
            json_content: JSON content from Claude export

        Returns:
            List of conversation dictionaries
        """
        try:
            data = json.loads(json_content)
        except json.JSONDecodeError as e:
            print(f"Error parsing JSON: {e}")
            return []

        conversations = []

        # Handle different possible JSON structures
        if isinstance(data, list):
            # If data is a list of conversations
            for item in data:
                conversation = self._extract_conversation_from_json(item)
                if conversation:
                    conversations.append(conversation)
        elif isinstance(data, dict):
            # Check for common structures
            if "conversations" in data:
                # Structure: {"conversations": [...]}
                for item in data["conversations"]:
                    conversation = self._extract_conversation_from_json(item)
                    if conversation:
                        conversations.append(conversation)
            elif "messages" in data:
                # Single conversation with messages
                conversation = self._extract_conversation_from_json(data)
                if conversation:
                    conversations.append(conversation)
            else:
                # Try to treat the whole object as a conversation
                conversation = self._extract_conversation_from_json(data)
                if conversation:
                    conversations.append(conversation)

        return conversations

    def _extract_conversation_from_json(self, conv_data: dict) -> dict | None:
        """
        Extract conversation data from a JSON object.

        Args:
            conv_data: Dictionary containing conversation data

        Returns:
            Dictionary with conversation data or None
        """
        if not isinstance(conv_data, dict):
            return None

        messages = []

        # Look for messages in various possible structures
        message_sources = []
        if "messages" in conv_data:
            message_sources = conv_data["messages"]
        elif "chat" in conv_data:
            message_sources = conv_data["chat"]
        elif "conversation" in conv_data:
            message_sources = conv_data["conversation"]
        else:
            # If no clear message structure, try to extract from the object itself
            if "content" in conv_data and "role" in conv_data:
                message_sources = [conv_data]

        for msg_data in message_sources:
            message = self._extract_message_from_json(msg_data)
            if message:
                messages.append(message)

        if not messages:
            return None

        # Extract conversation metadata
        title = self._extract_title_from_conversation(conv_data, messages)
        timestamp = self._extract_timestamp_from_conversation(conv_data)

        return {"title": title, "messages": messages, "timestamp": timestamp}

    def _extract_message_from_json(self, msg_data: dict) -> dict | None:
        """
        Extract message data from a JSON message object.

        Args:
            msg_data: Dictionary containing message data

        Returns:
            Dictionary with message data or None
        """
        if not isinstance(msg_data, dict):
            return None

        # Extract content from various possible fields
        content = ""
        content_fields = ["content", "text", "message", "body"]
        for field in content_fields:
            if msg_data.get(field):
                content = str(msg_data[field])
                break

        if not content or len(content.strip()) < 3:
            return None

        # Extract role (user/assistant/human/ai/claude)
        role = "mixed"  # Default role
        role_fields = ["role", "sender", "from", "author", "type"]
        for field in role_fields:
            if msg_data.get(field):
                role_value = str(msg_data[field]).lower()
                if role_value in ["user", "human", "person"]:
                    role = "user"
                elif role_value in ["assistant", "ai", "claude", "bot"]:
                    role = "assistant"
                break

        # Extract timestamp
        timestamp = self._extract_timestamp_from_message(msg_data)

        return {"role": role, "content": content, "timestamp": timestamp}

    def _extract_timestamp_from_message(self, msg_data: dict) -> str | None:
        """Extract timestamp from message data."""
        timestamp_fields = ["timestamp", "created_at", "date", "time"]
        for field in timestamp_fields:
            if msg_data.get(field):
                return str(msg_data[field])
        return None

    def _extract_timestamp_from_conversation(self, conv_data: dict) -> str | None:
        """Extract timestamp from conversation data."""
        timestamp_fields = ["timestamp", "created_at", "date", "updated_at", "last_updated"]
        for field in timestamp_fields:
            if conv_data.get(field):
                return str(conv_data[field])
        return None

    def _extract_title_from_conversation(self, conv_data: dict, messages: list) -> str:
        """Extract or generate title for conversation."""
        # Try to find explicit title
        title_fields = ["title", "name", "subject", "topic"]
        for field in title_fields:
            if conv_data.get(field):
                return str(conv_data[field])

        # Generate title from first user message
        for message in messages:
            if message.get("role") == "user":
                content = message.get("content", "")
                if content:
                    # Use first 50 characters as title
                    title = content[:50].strip()
                    if len(content) > 50:
                        title += "..."
                    return title

        return "Claude Conversation"

    def _create_concatenated_content(self, conversation: dict) -> str:
        """
        Create concatenated content from conversation messages.

        Args:
            conversation: Dictionary containing conversation data

        Returns:
            Formatted concatenated content
        """
        title = conversation.get("title", "Claude Conversation")
        messages = conversation.get("messages", [])
        timestamp = conversation.get("timestamp", "Unknown")

        # Build message content
        message_parts = []
        for message in messages:
            role = message.get("role", "mixed")
            content = message.get("content", "")
            msg_timestamp = message.get("timestamp", "")

            if role == "user":
                prefix = "[You]"
            elif role == "assistant":
                prefix = "[Claude]"
            else:
                prefix = "[Message]"

            # Add timestamp if available
            if msg_timestamp:
                prefix += f" ({msg_timestamp})"

            message_parts.append(f"{prefix}: {content}")

        concatenated_text = "\n\n".join(message_parts)

        # Create final document content
        doc_content = f"""Conversation: {title}
Date: {timestamp}
Messages ({len(messages)} messages):

{concatenated_text}
"""
        return doc_content

    def load_data(self, input_dir: str | None = None, **load_kwargs: Any) -> list[Document]:
        """
        Load Claude export data.

        Args:
            input_dir: Directory containing Claude export files or path to specific file
            **load_kwargs:
                max_count (int): Maximum number of conversations to process
                claude_export_path (str): Specific path to Claude export file/directory
                include_metadata (bool): Whether to include metadata in documents
        """
        docs: list[Document] = []
        max_count = load_kwargs.get("max_count", -1)
        claude_export_path = load_kwargs.get("claude_export_path", input_dir)
        include_metadata = load_kwargs.get("include_metadata", True)

        if not claude_export_path:
            print("No Claude export path provided")
            return docs

        export_path = Path(claude_export_path)

        if not export_path.exists():
            print(f"Claude export path not found: {export_path}")
            return docs

        json_contents = []

        # Handle different input types
        if export_path.is_file():
            if export_path.suffix.lower() == ".zip":
                # Extract JSON from zip file
                json_contents = self._extract_json_from_zip(export_path)
            elif export_path.suffix.lower() == ".json":
                # Read JSON file directly
                try:
                    with open(export_path, encoding="utf-8", errors="ignore") as f:
                        json_contents.append(f.read())
                except Exception as e:
                    print(f"Error reading JSON file {export_path}: {e}")
                    return docs
            else:
                print(f"Unsupported file type: {export_path.suffix}")
                return docs

        elif export_path.is_dir():
            # Look for JSON files in directory
            json_files = list(export_path.glob("*.json"))
            zip_files = list(export_path.glob("*.zip"))

            if json_files:
                print(f"Found {len(json_files)} JSON files in directory")
                for json_file in json_files:
                    try:
                        with open(json_file, encoding="utf-8", errors="ignore") as f:
                            json_contents.append(f.read())
                    except Exception as e:
                        print(f"Error reading JSON file {json_file}: {e}")
                        continue

            if zip_files:
                print(f"Found {len(zip_files)} ZIP files in directory")
                for zip_file in zip_files:
                    zip_contents = self._extract_json_from_zip(zip_file)
                    json_contents.extend(zip_contents)

            if not json_files and not zip_files:
                print(f"No JSON or ZIP files found in {export_path}")
                return docs

        if not json_contents:
            print("No JSON content found to process")
            return docs

        # Parse conversations from JSON content
        print("Parsing Claude conversations from JSON...")
        all_conversations = []
        for json_content in json_contents:
            conversations = self._parse_claude_json(json_content)
            all_conversations.extend(conversations)

        if not all_conversations:
            print("No conversations found in JSON content")
            return docs

        print(f"Found {len(all_conversations)} conversations")

        # Process conversations into documents
        count = 0
        for conversation in all_conversations:
            if max_count > 0 and count >= max_count:
                break

            if self.concatenate_conversations:
                # Create one document per conversation with concatenated messages
                doc_content = self._create_concatenated_content(conversation)

                metadata = {}
                if include_metadata:
                    metadata = {
                        "title": conversation.get("title", "Claude Conversation"),
                        "timestamp": conversation.get("timestamp", "Unknown"),
                        "message_count": len(conversation.get("messages", [])),
                        "source": "Claude Export",
                    }

                doc = Document(text=doc_content, metadata=metadata)
                docs.append(doc)
                count += 1

            else:
                # Create separate documents for each message
                for message in conversation.get("messages", []):
                    if max_count > 0 and count >= max_count:
                        break

                    role = message.get("role", "mixed")
                    content = message.get("content", "")
                    msg_timestamp = message.get("timestamp", "")

                    if not content.strip():
                        continue

                    # Create document content with context
                    doc_content = f"""Conversation: {conversation.get("title", "Claude Conversation")}
Role: {role}
Timestamp: {msg_timestamp or conversation.get("timestamp", "Unknown")}
Message: {content}
"""

                    metadata = {}
                    if include_metadata:
                        metadata = {
                            "conversation_title": conversation.get("title", "Claude Conversation"),
                            "role": role,
                            "timestamp": msg_timestamp or conversation.get("timestamp", "Unknown"),
                            "source": "Claude Export",
                        }

                    doc = Document(text=doc_content, metadata=metadata)
                    docs.append(doc)
                    count += 1

        print(f"Created {len(docs)} documents from Claude export")
        return docs