TheBigPromptLibrary/Tools/openai_gpts/gen_gpt_templ.py

"""
Generate markdown templates for ChatGPT GPTs by downloading and parsing their metadata.

By Elias Bachaalany

Usage:
    gen_template.py <gpt_url|gpt_id|g-prefixed_id> [--debug] [--dump]
    gen_template.py @response_file.txt [--debug] [--dump]
"""
import re
import json
import os
import sys
import argparse
import requests
from collections import namedtuple

# Success payload of generate_template(): the rendered markdown, the GPT's
# short URL, the ID derived from that short URL, and the parser that was used.
GenerateTemplateResult = namedtuple(
    'GenerateTemplateResult',
    'template short_url gpt_id parser',
)

# Markdown skeleton for a single GPT entry, rendered with str.format().
# Placeholders: {short_url}, {profile_pic}, {title}, {description} and
# {author_display_name}. The fenced block under "GPT instructions:" is
# emitted empty.
TEMPLATE = """GPT URL: https://chatgpt.com/g/{short_url}

GPT logo: <img src="{profile_pic}" width="100px" />

GPT Title: {title}

GPT Description: {description} - By {author_display_name}

GPT instructions:

```markdown

```"""

# ----------------------------------------------------------
def parse_gpt_id(url):
    """
    Pull the 9-character GPT ID out of a ChatGPT GPT URL.

    Args:
        url (str): Text containing a '/g/g-<id>' path segment, e.g.
            https://chatgpt.com/g/g-VgbIr9TQQ-ida-pro-c-sdk-and-decompiler

    Returns:
        str or None: The nine alphanumeric characters that follow 'g-'
            (e.g., 'VgbIr9TQQ'), or None when no such segment is present
    """
    # Only the first '/g/g-' occurrence is considered.
    found = re.search(r'/g/g-([a-zA-Z0-9]{9})', url)
    return found.group(1) if found else None

# ----------------------------------------------------------
# Matches window.__reactRouterContext.streamController.enqueue("...") calls
# and captures the body of the quoted string-literal argument as 'raw'.
_ENQUEUE_RE = re.compile(
    r'window\.__reactRouterContext\.streamController\.enqueue\(\s*'       # find the call
    r'(?P<q>["\'])'                         # capture whether it's " or '
    r'(?P<raw>(?:\\.|(?!\1).)*?)'           # any escaped-char or char not the opening quote
    r'(?P=q)\s*'                            # matching closing quote
    r'\)',
    flags=re.DOTALL
)

# ----------------------------------------------------------
# Escapes understood by the fallback decoder; any other backslash sequence
# is left exactly as written.
_SIMPLE_ESCAPES = {
    'n': '\n',
    't': '\t',
    '"': '"',
    "'": "'",
    '\\': '\\',
}

# A backslash together with the character it escapes (DOTALL so that a
# backslash followed by a newline is consumed as one unit as well).
_ESCAPE_PAIR_RE = re.compile(r'\\(.)', flags=re.DOTALL)


def _decode_simple_escapes(raw):
    """Decode the escapes in _SIMPLE_ESCAPES in one left-to-right pass."""
    # A single pass is required for correctness: with chained str.replace()
    # calls an escaped backslash followed by 'n' (\\n) would be misread as
    # a newline escape.
    return _ESCAPE_PAIR_RE.sub(
        lambda m: _SIMPLE_ESCAPES.get(m.group(1), m.group(0)),
        raw,
    )


def extract_enqueue_args(html_text, decode_escapes=True):
    """
    Collect the string argument of every streamController.enqueue(...) call.

    Args:
        html_text (str): Text to scan (typically the downloaded page HTML)
        decode_escapes (bool): If True, backslash escapes inside each
            literal are decoded; if False, the text between the quotes is
            returned untouched

    Returns:
        list[str]: One entry per matched call, in order of appearance;
            empty when nothing matches
    """
    args = []
    for m in _ENQUEUE_RE.finditer(html_text):
        raw = m.group('raw')
        if decode_escapes:
            try:
                # Parsing as a JSON string handles \uXXXX and friends without
                # double-encoding characters that are already Unicode.
                raw = json.loads('"' + raw + '"')
            except ValueError:
                # json.JSONDecodeError is a ValueError. This happens for
                # literals that are not valid JSON strings, e.g. ones that
                # contain \' or an unescaped double quote.
                raw = _decode_simple_escapes(raw)
        args.append(raw)
    return args


# ----------------------------------------------------------
class CustomGPTParser:
    """
    Parse the longest streamController.enqueue(...) payload found in a GPT
    page into a flat item list and look up metadata values in that list.

    Plain elements of the decoded JSON array are stored as-is; each
    key/value pair of a dict element is stored as a (key, value) tuple.
    The lookup methods only consider the plain (non-tuple) elements.
    """

    # Cache-key namespaces. Results of the heuristic getters and of
    # get_str_value() are kept apart so that, for example, get_title() and
    # get_str_value('title') can never return each other's cached answer.
    _CACHE_DERIVED = 'derived'
    _CACHE_NAMED = 'named'

    def __init__(self):
        self._parse_cache = {}     # (namespace, key) -> cached lookup result
        self._parsed_items = None  # Flattened items of the last successful parse()

    def parse(self, source, debug: bool = False):
        """
        Parse page content (or a file containing it) into the item list.

        Args:
            source (str): Page content, or the path of an existing file to
                read the content from
            debug: If True, print diagnostics while parsing

        Returns:
            tuple: (True, None) on success, (False, error_message) on failure
        """
        # Only treat the argument as a path when it plausibly is one; this
        # avoids handing a whole HTML document to the filesystem layer.
        is_likely_filename = (
            len(source) < 1000 and  # Reasonable filename length
            '|' not in source and  # Filenames don't contain pipes
            os.path.isfile(source)
        )

        if is_likely_filename:
            try:
                with open(source, encoding='utf-8') as f:
                    content = f.read()
            except (OSError, UnicodeError) as e:
                return (False, f"Error reading file: {e}")
        else:
            # Treat as content
            content = source

        if not (enqueue_args := extract_enqueue_args(content)):
            msg = "No enqueue arguments found in the provided string."
            if debug:
                print(msg)
            return (False, msg)

        try:
            # Use the argument with the longest length (most likely the Gizmo data)
            data = json.loads(max(enqueue_args, key=len))
        except json.JSONDecodeError as e:
            return (False, f"JSON decoding error: {e}")

        # The flattening below is only meaningful for a JSON array; report
        # anything else through the normal (success, error) return value.
        if not isinstance(data, list):
            return (False, f"Unexpected JSON payload type: {type(data).__name__}")

        parsed_items = []
        for item in data:
            if isinstance(item, dict):
                parsed_items.extend(item.items())
            else:
                if debug:
                    print(f"   {item}  (type: {type(item).__name__})")
                parsed_items.append(item)

        self._parsed_items = parsed_items
        # Cached lookups belong to the previous payload; drop them.
        self._parse_cache.clear()
        return (True, None)

    def get_title(self):
        """
        Extract the title of the GPT by finding the item preceding 'description'.

        The first 'description' item whose immediate predecessor is a string
        determines the result.

        Returns:
            str: The title value or empty string on failure
        """
        cache_key = (self._CACHE_DERIVED, 'title')
        if cache_key in self._parse_cache:
            return self._parse_cache[cache_key]

        items = self._parsed_items
        if not items:
            return ''

        # Walk adjacent pairs; a (key, value) tuple never equals
        # 'description' and is rejected as a title by the isinstance check.
        for previous, current in zip(items, items[1:]):
            if current == 'description' and isinstance(previous, str):
                self._parse_cache[cache_key] = previous
                return previous

        return ''

    def get_author_display_name(self):
        """
        Extract the author display name by finding the item after 'user-{id}'.

        The pattern is:
        - 'user_id' (literal string)
        - 'user-{actual_user_id}' (e.g., 'user-IUwuaeXwGuwv0UoRPaeEqlzs')
        - '{author_display_name}' (e.g., 'Elias Bachaalany')

        Returns:
            str: The author display name or empty string on failure
        """
        cache_key = (self._CACHE_DERIVED, 'author_display_name')
        if cache_key in self._parse_cache:
            return self._parse_cache[cache_key]

        items = self._parsed_items
        if not items:
            return ''

        # Slide a window of three consecutive items over the list.
        for marker, user_ref, display_name in zip(items, items[1:], items[2:]):
            if (
                marker == 'user_id'
                and isinstance(user_ref, str)
                and user_ref.startswith('user-')
                and isinstance(display_name, str)
            ):
                self._parse_cache[cache_key] = display_name
                return display_name

        return ''

    def get_str_value(self, name: str, default: str = None):
        """
        Get a string value by name from the parsed items.

        The value is the item that immediately follows the first plain item
        equal to `name`, converted with str().

        Args:
            name: The key/name to search for
            default: Default value if not found

        Returns:
            str: The value associated with the name or default
        """
        cache_key = (self._CACHE_NAMED, name)
        if cache_key in self._parse_cache:
            return self._parse_cache[cache_key]

        if not self._parsed_items:
            return default

        it = iter(self._parsed_items)
        for item in it:
            # (key, value) tuples from dictionaries are not searched.
            if isinstance(item, tuple):
                continue

            # Flat list layout: the name is followed by its value.
            if item == name:
                try:
                    value = str(next(it))
                except StopIteration:
                    # The name was the very last item; there is no value.
                    return default
                self._parse_cache[cache_key] = value
                return value

        return default

    def clear_cache(self):
        """Clear the internal cache"""
        self._parse_cache.clear()

    def get_parsed_items(self):
        """Get the parsed items (for backward compatibility)"""
        return self._parsed_items or []

    def dump(self, safe_ascii=True):
        """
        Dump all parsed items in a formatted way.

        Args:
            safe_ascii (bool): If True, replace non-ASCII characters with '?'
                so the output can be written to any console

        Returns:
            None (prints to stdout)
        """
        if not self._parsed_items:
            print("No parsed items to dump")
            return

        def render(value):
            # Stringify, optionally degrading non-ASCII characters to '?'.
            text = str(value)
            if safe_ascii:
                text = text.encode('ascii', errors='replace').decode('ascii')
            return text

        print(f"Dumping {len(self._parsed_items)} parsed items:")
        print("-" * 60)

        for item in self._parsed_items:
            if isinstance(item, tuple) and len(item) == 2:
                # Key-value pair that came from a dictionary element
                k, v = item
                print(f"   {render(k)}: {render(v)}  (type: {type(v).__name__})")
            else:
                print(f"   {render(item)}  (type: {type(item).__name__})")

        print("-" * 60)

# ----------------------------------------------------------
def download_page(url: str, out_filename: str = '') -> tuple[bool, object]:
    """
    Download a page using browser-like headers

    Args:
        url (str): The full URL to download; must start with http:// or https://
        out_filename (str): Optional filename to save to. If empty, no file is written.

    Returns:
        tuple[bool, object]: (success, content/error_message)
            - (True, content) if successful
            - (False, error_message) if failed
    """
    # Require an explicit HTTP(S) scheme; a bare 'http' prefix check would
    # also let through strings such as 'httpfoo://...'.
    if not url.startswith(('http://', 'https://')):
        return (False, "Please provide a full URL starting with http:// or https://")

    # Base headers from the sample request
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:139.0) Gecko/20100101 Firefox/139.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        # Accept-Encoding from the sample request is intentionally not
        # copied here (original note: avoid a compressed response).
        'DNT': '1',
        'Sec-GPC': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Priority': 'u=0, i',
        'TE': 'trailers'
    }

    try:
        # The context manager closes the session (and its pooled
        # connections) on every exit path, including errors.
        with requests.Session() as session:
            session.headers.update(headers)
            response = session.get(url, timeout=30)

            # Turn 4xx/5xx responses into an exception
            response.raise_for_status()
            page_text = response.text

        # Save to file if filename provided
        if out_filename:
            with open(out_filename, 'w', encoding='utf-8') as f:
                f.write(page_text)

        return (True, page_text)

    except requests.exceptions.RequestException as e:
        return (False, f"Error downloading page: {e}")
    except Exception as e:
        # Best-effort boundary: e.g. failure to write out_filename
        return (False, f"Unexpected error: {e}")

# ----------------------------------------------------------
def process_gpt_input(input_str):
    """
    Normalize one user-supplied GPT reference into a URL and an ID.

    Accepted forms:
    - Full URL: https://chatgpt.com/g/g-VgbIr9TQQ-ida-pro-c-sdk-and-decompiler
    - Conversation URL: https://chatgpt.com/g/g-m5lMeGifF-sql-expert-querygpt/c/682cd38c-ca8c-800d-b6e2-33b8ba763824
    - GPT ID: VgbIr9TQQ
    - Prefixed GPT ID: g-VgbIr9TQQ

    Returns:
        tuple: (full_url, gpt_id)

    Raises:
        ValueError: If a URL carries no parsable GPT ID, or the input
            matches none of the accepted forms
    """
    # Form 1 and 2: anything with an explicit http(s) scheme.
    if input_str.startswith(('https://', 'http://')):
        gpt_id = parse_gpt_id(input_str)
        if not gpt_id:
            raise ValueError(f"Could not parse GPT ID from URL: {input_str}")

        # For a conversation URL keep only the part before the first '/c/';
        # partition() hands back the whole string when '/c/' is absent.
        gpt_url, _, _ = input_str.partition('/c/')
        return (gpt_url, gpt_id)

    # Form 4: 'g-' followed by at least nine more characters; the ID is
    # exactly the nine characters after the prefix.
    if len(input_str) >= 11 and input_str.startswith('g-'):
        return (f"https://chatgpt.com/g/{input_str}", input_str[2:11])

    # Form 3: any nine-character string is taken to be a bare ID.
    if len(input_str) == 9:
        return (f"https://chatgpt.com/g/g-{input_str}", input_str)

    raise ValueError(f"Invalid GPT input format: {input_str}")

def _print_console_safe(text):
    """Print text, falling back to ASCII with '?' replacements if the console cannot encode it."""
    try:
        print(text)
    except UnicodeEncodeError:
        print(text.encode('ascii', errors='replace').decode('ascii'))


def generate_template(url, debug=False, dump=False):
    """
    Download and parse GPT data, then generate markdown template

    Args:
        url: Full GPT URL
        debug: Whether to save debug files (HTML and dump)
        dump: Whether to print parsed items to console

    Returns:
        tuple: (success, result_or_error)
            - (True, GenerateTemplateResult) if successful
            - (False, error_message) if failed
    """
    print(f"[DOWNLOAD] Fetching page from: {url}")
    # In debug mode the raw HTML is also written to disk by download_page()
    save_file = None
    if debug:
        save_file = "debug_download.html"
        print(f"[DEBUG] Will save HTML to: {save_file}")

    success, content = download_page(url, save_file)
    if not success:
        return (False, f"Download failed: {content}")

    print(f"[DOWNLOAD] Successfully downloaded {len(content)} bytes")

    # Parse the content
    print(f"[PARSE] Parsing GPT data...")
    parser = CustomGPTParser()
    success, error = parser.parse(content)
    if not success:
        return (False, f"Parsing failed: {error}")

    print(f"[PARSE] Successfully parsed {len(parser.get_parsed_items())} items")

    # Save dump if debug mode
    if debug:
        from contextlib import redirect_stdout
        from io import StringIO

        # redirect_stdout restores sys.stdout even if dump() raises, so a
        # failure here cannot silence the rest of the program's output.
        buffer = StringIO()
        with redirect_stdout(buffer):
            parser.dump(safe_ascii=True)

        dump_file = "debug_dump.txt"
        with open(dump_file, 'w', encoding='utf-8') as f:
            f.write(buffer.getvalue())
        print(f"[DEBUG] Saved parsed data dump to: {dump_file}")

    # Extract required fields
    print(f"[EXTRACT] Extracting GPT metadata...")
    short_url = parser.get_str_value('short_url', 'UNKNOWN')
    profile_pic = parser.get_str_value('profile_picture_url', '')
    title = parser.get_title()
    description = parser.get_str_value('description', '')
    author_display_name = parser.get_author_display_name()

    # Long descriptions are shortened to their first 50 characters.
    if len(description) > 50:
        description_preview = f"{description[:50]}..."
    else:
        description_preview = description

    # Every extracted value may contain characters the console cannot
    # encode, so all of them go through the safe printer.
    print(f"[EXTRACT] Found:")
    _print_console_safe(f"  - Short URL: {short_url}")
    _print_console_safe(f"  - Title: {title}")
    _print_console_safe(f"  - Author: {author_display_name}")
    _print_console_safe(f"  - Description: {description_preview}")
    print(f"  - Profile Pic: {'Yes' if profile_pic else 'No'}")

    # Dump parsed items if requested
    if dump:
        print("\n[DUMP] Parsed items:")
        parser.dump(safe_ascii=False)

    # Generate template
    template = TEMPLATE.format(
        short_url=short_url,
        profile_pic=profile_pic,
        title=title,
        description=description,
        author_display_name=author_display_name
    )

    # Extract GPT ID from short_url (remove 'g-' prefix if it exists)
    gpt_id = short_url[2:] if short_url.startswith('g-') else short_url

    return (True, GenerateTemplateResult(template, short_url, gpt_id, parser))

def process_response_file(filename, debug=False, dump=False):
    """
    Process a response file containing multiple GPT URLs/IDs

    Blank lines and lines starting with '#' are ignored. Each remaining
    line is processed independently; a failure on one line is reported and
    counted, and processing continues with the next line.

    Args:
        filename: Path to the response file
        debug: Whether to save debug files
        dump: Whether to dump parsed items

    Returns:
        tuple: (success_count, error_count)
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            stripped_lines = [line.strip() for line in f]
    except (OSError, ValueError) as e:
        # ValueError covers undecodable content (UnicodeError) and
        # malformed paths such as ones with an embedded null byte.
        print(f"Error reading response file: {e}")
        return (0, 1)

    # Keep every non-empty line that is not a comment
    inputs = [entry for entry in stripped_lines if entry and not entry.startswith('#')]

    if not inputs:
        print(f"No valid inputs found in {filename}")
        return (0, 0)

    print(f"\n{'=' * 70}")
    print(f"PROCESSING RESPONSE FILE: {filename}")
    print(f"Found {len(inputs)} items to process")
    print(f"{'=' * 70}")

    success_count = 0
    error_count = 0

    for i, input_str in enumerate(inputs, 1):
        print(f"\n[ITEM {i}/{len(inputs)}] Processing: {input_str}")
        print("-" * 60)

        try:
            # Process input
            url, gpt_id = process_gpt_input(input_str)
            print(f"[PARSED] Full URL: {url}")
            print(f"[PARSED] GPT ID: {gpt_id}")

            # Generate template
            success, result = generate_template(url, debug, dump)
            if success:
                # Distinct name: do not clobber the `filename` parameter.
                template_filename = f"{result.gpt_id}.md"
                with open(template_filename, 'w', encoding='utf-8') as f:
                    f.write(result.template)
                print(f"[SUCCESS] Template saved to: {template_filename}")

                # Save dump file if requested
                if dump:
                    from contextlib import redirect_stdout
                    from io import StringIO

                    dump_filename = f"{result.gpt_id}.txt"
                    # redirect_stdout restores sys.stdout even when dump()
                    # raises; otherwise the error report below and all
                    # later output would vanish into the buffer.
                    buffer = StringIO()
                    with redirect_stdout(buffer):
                        result.parser.dump(safe_ascii=True)

                    with open(dump_filename, 'w', encoding='utf-8') as f:
                        f.write(buffer.getvalue())
                    print(f"[SUCCESS] Dump saved to: {dump_filename}")

                success_count += 1
            else:
                print(f"[FAILED] Error: {result}")
                error_count += 1

        except Exception as e:
            # Per-item boundary: one bad entry must not abort the batch
            print(f"[ERROR] Exception: {e}")
            error_count += 1

    print(f"\n{'=' * 70}")
    print(f"RESPONSE FILE COMPLETE")
    print(f"Success: {success_count}, Errors: {error_count}")
    print(f"{'=' * 70}")

    return (success_count, error_count)

def main():
    """
    Command-line entry point.

    Handles either a single GPT reference or, when the argument starts
    with '@', a response file listing several of them.

    Returns:
        int: 0 on success; 1 when no input was given, when processing
            failed, or when a response file produced at least one error
    """
    parser = argparse.ArgumentParser(description='Generate markdown template for ChatGPT GPTs')
    parser.add_argument('input', nargs='?', help='GPT URL, GPT ID, g-prefixed GPT ID, or @response_file')
    parser.add_argument('--debug', action='store_true', help='Save debug files (HTML and dump)')
    parser.add_argument('--dump', action='store_true', help='Save parsed names and values to .txt file')

    args = parser.parse_args()

    # Check if input was provided
    if not args.input:
        parser.print_help()
        return 1

    try:
        # Check if input is a response file
        if args.input.startswith('@'):
            filename = args.input[1:]  # Remove the @ prefix
            success_count, error_count = process_response_file(filename, args.debug, args.dump)
            return 0 if error_count == 0 else 1

        # Process single input
        print(f"\n[INPUT] Processing: {args.input}")
        url, gpt_id = process_gpt_input(args.input)
        print(f"[PARSED] Full URL: {url}")
        print(f"[PARSED] GPT ID: {gpt_id}")

        # Generate template
        success, result = generate_template(url, args.debug, args.dump)
        if not success:
            print(f"Error: {result}")
            return 1

        # Save to file
        filename = f"{result.gpt_id}.md"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(result.template)
        print(f"Template saved to: {filename}")

        # Save dump file if requested
        if args.dump:
            from contextlib import redirect_stdout
            from io import StringIO

            dump_filename = f"{result.gpt_id}.txt"
            # redirect_stdout restores sys.stdout even when dump() raises,
            # so the error message printed below still reaches the console.
            buffer = StringIO()
            with redirect_stdout(buffer):
                result.parser.dump(safe_ascii=True)

            with open(dump_filename, 'w', encoding='utf-8') as f:
                f.write(buffer.getvalue())
            print(f"Dump saved to: {dump_filename}")

        # Also print the template
        print("\nGenerated template:")
        print("=" * 50)
        try:
            print(result.template)
        except UnicodeEncodeError:
            # Handle special characters that can't be printed to console
            safe_template = result.template.encode('ascii', errors='replace').decode('ascii')
            print(safe_template)

        return 0

    except Exception as e:
        # Top-level boundary: report and signal failure via the exit code
        print(f"Error: {e}")
        return 1

# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())