make the google history wonderful format

This commit is contained in:
yichuan520030910320
2025-07-22 20:43:56 -07:00
parent 2a96d05b21
commit f3f5d91207
3 changed files with 17 additions and 16 deletions

View File

@@ -292,6 +292,8 @@ Once the index is built, you can ask questions like:
</details> </details>
Slack support coming soon! Stay tuned!
## 🖥️ Command Line Interface ## 🖥️ Command Line Interface
LEANN includes a powerful CLI for document processing and search. Perfect for quick document indexing and interactive chat. LEANN includes a powerful CLI for document processing and search. Perfect for quick document indexing and interactive chat.

View File

@@ -65,12 +65,14 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
if not all_documents: if not all_documents:
print("No documents loaded from any source. Exiting.") print("No documents loaded from any source. Exiting.")
# Print a red, highlighted reminder that all Chrome browsers must be closed before running this script.
print("\033[91mYou need to close or quit all chrome browser before running this script\033[0m")
return None return None
print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles") print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles")
# Create text splitter with 256 chunk size # Create text splitter with 256 chunk size
text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128)
# Convert Documents to text strings and chunk them # Convert Documents to text strings and chunk them
all_texts = [] all_texts = []
@@ -78,7 +80,9 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i
# Split the document into chunks # Split the document into chunks
nodes = text_splitter.get_nodes_from_documents([doc]) nodes = text_splitter.get_nodes_from_documents([doc])
for node in nodes: for node in nodes:
all_texts.append(node.get_content()) text = node.get_content()
# text = '[Title] ' + doc.metadata["title"] + '\n' + text
all_texts.append(text)
print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents") print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents")
@@ -225,7 +229,7 @@ async def main():
parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index') parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index')
parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE, parser.add_argument('--chrome-profile', type=str, default=DEFAULT_CHROME_PROFILE,
help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this') help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this')
parser.add_argument('--index-dir', type=str, default="./chrome_history_index_leann_test", parser.add_argument('--index-dir', type=str, default="./all_google_new",
help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)') help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)')
parser.add_argument('--max-entries', type=int, default=1000, parser.add_argument('--max-entries', type=int, default=1000,
help='Maximum number of history entries to process (default: 1000)') help='Maximum number of history entries to process (default: 1000)')

View File

@@ -74,22 +74,17 @@ class ChromeHistoryReader(BaseReader):
# Create document content with metadata embedded in text # Create document content with metadata embedded in text
doc_content = f""" doc_content = f"""
[BROWSING HISTORY METADATA] [Title]: {title}
URL: {url} [URL of the page]: {url}
Title: {title} [Last visited time]: {last_visit}
Last Visit: {last_visit} [Visit times]: {visit_count}
Visit Count: {visit_count} [Typed times]: {typed_count}
Typed Count: {typed_count}
Hidden: {hidden}
[END METADATA]
Title: {title}
URL: {url}
Last visited: {last_visit}
""" """
# Create document with embedded metadata # Create document with embedded metadata
doc = Document(text=doc_content, metadata={}) doc = Document(text=doc_content, metadata={ "title": title[0:150]})
# if len(title) > 150:
# print(f"Title is too long: {title}")
docs.append(doc) docs.append(doc)
count += 1 count += 1