Display context chunks in ask and search results (#149)

* Printing querying time

* Adding source name to chunks

Adding source name as metadata to chunks, then printing the sources when searching

* Printing the context provided to LLM

To check the data transmitted to the LLMs: display the relevance, ID, content, and source of each sent chunk.

* Correcting source as metadata for chunks

* Applying ruff format

* Applying Ruff formatting

* Ruff formatting
This commit is contained in:
CelineNi2
2025-10-24 00:03:59 +02:00
committed by GitHub
parent ab251ab751
commit abf312d998
2 changed files with 25 additions and 3 deletions

View File

@@ -1236,6 +1236,17 @@ class LeannChat:
             "Please provide the best answer you can based on this context and your knowledge."
         )
+        print("The context provided to the LLM is:")
+        print(f"{'Relevance':<10} | {'Chunk id':<10} | {'Content':<60} | {'Source':<80}")
+        print("-" * 150)
+        for r in results:
+            chunk_relevance = f"{r.score:.3f}"
+            chunk_id = r.id
+            chunk_content = r.text[:60]
+            chunk_source = r.metadata.get("source", "")[:80]
+            print(
+                f"{chunk_relevance:<10} | {chunk_id:<10} | {chunk_content:<60} | {chunk_source:<80}"
+            )
         ask_time = time.time()
         ans = self.llm.ask(prompt, **llm_kwargs)
         ask_time = time.time() - ask_time

View File

@@ -1,5 +1,6 @@
 import argparse
 import asyncio
+import time
 from pathlib import Path
 from typing import Any, Optional, Union
@@ -1186,6 +1187,7 @@ Examples:
             for doc in other_docs:
                 file_path = doc.metadata.get("file_path", "")
                 if file_filter(file_path):
+                    doc.metadata["source"] = file_path
                     filtered_docs.append(doc)
             documents.extend(filtered_docs)
@@ -1290,7 +1292,10 @@ Examples:
             nodes = parser.get_nodes_from_documents([doc])
             for node in nodes:
-                all_texts.append(node.get_content())
+                text_with_source = (
+                    "Chunk source:" + source_path + "\n" + node.get_content().replace("\n", " ")
+                )
+                all_texts.append(text_with_source)
         print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
         return all_texts
@@ -1388,8 +1393,10 @@ Examples:
             num_threads=args.num_threads,
         )
-        for chunk_text in all_texts:
-            builder.add_text(chunk_text)
+        for chunk_text_with_source in all_texts:
+            chunk_source = chunk_text_with_source.split("\n")[0].split(":")[1]
+            chunk_text = chunk_text_with_source.split("\n")[1]
+            builder.add_text(chunk_text, {"source": chunk_source})
         builder.build_index(index_path)
         print(f"Index built at {index_path}")
@@ -1511,6 +1518,7 @@ Examples:
         for i, result in enumerate(results, 1):
             print(f"{i}. Score: {result.score:.3f}")
             print(f"   {result.text[:200]}...")
+            print(f"   Source: {result.metadata.get('source', '')}")
             print()

     async def ask_questions(self, args):
@@ -1542,6 +1550,7 @@ Examples:
             llm_kwargs["thinking_budget"] = args.thinking_budget

         def _ask_once(prompt: str) -> None:
+            query_start_time = time.time()
             response = chat.ask(
                 prompt,
                 top_k=args.top_k,
@@ -1552,7 +1561,9 @@ Examples:
                 pruning_strategy=args.pruning_strategy,
                 llm_kwargs=llm_kwargs,
             )
+            query_completion_time = time.time() - query_start_time
             print(f"LEANN: {response}")
+            print(f"The query took {query_completion_time:.3f} seconds to finish")

         initial_query = (args.query or "").strip()