diff --git a/README.md b/README.md index 8f62ca8..edfd77f 100755 --- a/README.md +++ b/README.md @@ -292,6 +292,8 @@ Once the index is built, you can ask questions like: +Slack support coming soon! Stay tuned! + ## 🖥️ Command Line Interface LEANN includes a powerful CLI for document processing and search. Perfect for quick document indexing and interactive chat. diff --git a/examples/google_history_reader_leann.py b/examples/google_history_reader_leann.py index fd97d98..0098353 100644 --- a/examples/google_history_reader_leann.py +++ b/examples/google_history_reader_leann.py @@ -65,12 +65,14 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i if not all_documents: print("No documents loaded from any source. Exiting.") + # Print a red warning: all Chrome browser windows must be closed before running this script. + print("\033[91mYou need to close or quit all chrome browser before running this script\033[0m") return None print(f"\nTotal loaded {len(all_documents)} history documents from {len(profile_dirs)} profiles") # Create text splitter with 256 chunk size - text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=25) + text_splitter = SentenceSplitter(chunk_size=256, chunk_overlap=128) # Convert Documents to text strings and chunk them all_texts = [] @@ -78,7 +80,9 @@ def create_leann_index_from_multiple_chrome_profiles(profile_dirs: List[Path], i # Split the document into chunks nodes = text_splitter.get_nodes_from_documents([doc]) for node in nodes: - all_texts.append(node.get_content()) + text = node.get_content() + # text = '[Title] ' + doc.metadata["title"] + '\n' + text + all_texts.append(text) print(f"Created {len(all_texts)} text chunks from {len(all_documents)} documents") @@ -225,7 +229,7 @@ async def main(): parser = argparse.ArgumentParser(description='LEANN Chrome History Reader - Create and query browser history index') parser.add_argument('--chrome-profile', type=str, 
default=DEFAULT_CHROME_PROFILE, help=f'Path to Chrome profile directory (default: {DEFAULT_CHROME_PROFILE}), usually you dont need to change this') - parser.add_argument('--index-dir', type=str, default="./chrome_history_index_leann_test", + parser.add_argument('--index-dir', type=str, default="./all_google_new", help='Directory to store the LEANN index (default: ./chrome_history_index_leann_test)') parser.add_argument('--max-entries', type=int, default=1000, help='Maximum number of history entries to process (default: 1000)') diff --git a/examples/history_data/history.py b/examples/history_data/history.py index 0258258..4f53534 100644 --- a/examples/history_data/history.py +++ b/examples/history_data/history.py @@ -74,22 +74,17 @@ class ChromeHistoryReader(BaseReader): # Create document content with metadata embedded in text doc_content = f""" -[BROWSING HISTORY METADATA] -URL: {url} -Title: {title} -Last Visit: {last_visit} -Visit Count: {visit_count} -Typed Count: {typed_count} -Hidden: {hidden} -[END METADATA] - -Title: {title} -URL: {url} -Last visited: {last_visit} +[Title]: {title} +[URL of the page]: {url} +[Last visited time]: {last_visit} +[Visit times]: {visit_count} +[Typed times]: {typed_count} """ # Create document with embedded metadata - doc = Document(text=doc_content, metadata={}) + doc = Document(text=doc_content, metadata={ "title": title[0:150]}) + # if len(title) > 150: + # print(f"Title is too long: {title}") docs.append(doc) count += 1