add gpt oss! serve your RAG using ollama
This commit is contained in:
Submodule packages/leann-backend-diskann/third_party/DiskANN updated: 67a2611ad1...af2a26481e
@@ -489,11 +489,35 @@ class OllamaChat(LLMInterface):
|
||||
import requests
|
||||
|
||||
full_url = f"{self.host}/api/generate"
|
||||
|
||||
# Handle thinking budget for reasoning models
|
||||
options = kwargs.copy()
|
||||
thinking_budget = kwargs.get("thinking_budget")
|
||||
if thinking_budget:
|
||||
# Remove thinking_budget from options as it's not a standard Ollama option
|
||||
options.pop("thinking_budget", None)
|
||||
# Only apply reasoning parameters to models that support it
|
||||
reasoning_supported_models = [
|
||||
"gpt-oss:20b",
|
||||
"gpt-oss:120b",
|
||||
"deepseek-r1",
|
||||
"deepseek-coder",
|
||||
]
|
||||
|
||||
if thinking_budget in ["low", "medium", "high"]:
|
||||
if any(model in self.model.lower() for model in reasoning_supported_models):
|
||||
options["reasoning"] = {"effort": thinking_budget, "exclude": False}
|
||||
logger.info(f"Applied reasoning effort={thinking_budget} to model {self.model}")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Thinking budget '{thinking_budget}' requested but model '{self.model}' may not support reasoning parameters. Proceeding without reasoning."
|
||||
)
|
||||
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"prompt": prompt,
|
||||
"stream": False, # Keep it simple for now
|
||||
"options": kwargs,
|
||||
"options": options,
|
||||
}
|
||||
logger.debug(f"Sending request to Ollama: {payload}")
|
||||
try:
|
||||
@@ -684,11 +708,38 @@ class OpenAIChat(LLMInterface):
|
||||
params = {
|
||||
"model": self.model,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"max_tokens": kwargs.get("max_tokens", 1000),
|
||||
"temperature": kwargs.get("temperature", 0.7),
|
||||
**{k: v for k, v in kwargs.items() if k not in ["max_tokens", "temperature"]},
|
||||
}
|
||||
|
||||
# Handle max_tokens vs max_completion_tokens based on model
|
||||
max_tokens = kwargs.get("max_tokens", 1000)
|
||||
if "o3" in self.model or "o4" in self.model or "o1" in self.model:
|
||||
# o-series models use max_completion_tokens
|
||||
params["max_completion_tokens"] = max_tokens
|
||||
params["temperature"] = 1.0
|
||||
else:
|
||||
# Other models use max_tokens
|
||||
params["max_tokens"] = max_tokens
|
||||
|
||||
# Handle thinking budget for reasoning models
|
||||
thinking_budget = kwargs.get("thinking_budget")
|
||||
if thinking_budget and thinking_budget in ["low", "medium", "high"]:
|
||||
# Check if this is an o-series model (partial match for model names)
|
||||
o_series_models = ["o3", "o3-mini", "o4-mini", "o1", "o3-pro", "o3-deep-research"]
|
||||
if any(model in self.model for model in o_series_models):
|
||||
# Use the correct OpenAI reasoning parameter format
|
||||
params["reasoning_effort"] = thinking_budget
|
||||
logger.info(f"Applied reasoning_effort={thinking_budget} to model {self.model}")
|
||||
else:
|
||||
logger.warning(
|
||||
f"Thinking budget '{thinking_budget}' requested but model '{self.model}' may not support reasoning parameters. Proceeding without reasoning."
|
||||
)
|
||||
|
||||
# Add other kwargs (excluding thinking_budget as it's handled above)
|
||||
for k, v in kwargs.items():
|
||||
if k not in ["max_tokens", "temperature", "thinking_budget"]:
|
||||
params[k] = v
|
||||
|
||||
logger.info(f"Sending request to OpenAI with model {self.model}")
|
||||
|
||||
try:
|
||||
|
||||
@@ -125,6 +125,13 @@ Examples:
|
||||
choices=["global", "local", "proportional"],
|
||||
default="global",
|
||||
)
|
||||
ask_parser.add_argument(
|
||||
"--thinking-budget",
|
||||
type=str,
|
||||
choices=["low", "medium", "high"],
|
||||
default=None,
|
||||
help="Thinking budget for reasoning models (low/medium/high). Supported by GPT-Oss:20b and other reasoning models.",
|
||||
)
|
||||
|
||||
# List command
|
||||
subparsers.add_parser("list", help="List all indexes")
|
||||
@@ -308,6 +315,11 @@ Examples:
|
||||
if not user_input:
|
||||
continue
|
||||
|
||||
# Prepare LLM kwargs with thinking budget if specified
|
||||
llm_kwargs = {}
|
||||
if args.thinking_budget:
|
||||
llm_kwargs["thinking_budget"] = args.thinking_budget
|
||||
|
||||
response = chat.ask(
|
||||
user_input,
|
||||
top_k=args.top_k,
|
||||
@@ -316,11 +328,17 @@ Examples:
|
||||
prune_ratio=args.prune_ratio,
|
||||
recompute_embeddings=args.recompute_embeddings,
|
||||
pruning_strategy=args.pruning_strategy,
|
||||
llm_kwargs=llm_kwargs,
|
||||
)
|
||||
print(f"LEANN: {response}")
|
||||
else:
|
||||
query = input("Enter your question: ").strip()
|
||||
if query:
|
||||
# Prepare LLM kwargs with thinking budget if specified
|
||||
llm_kwargs = {}
|
||||
if args.thinking_budget:
|
||||
llm_kwargs["thinking_budget"] = args.thinking_budget
|
||||
|
||||
response = chat.ask(
|
||||
query,
|
||||
top_k=args.top_k,
|
||||
@@ -329,6 +347,7 @@ Examples:
|
||||
prune_ratio=args.prune_ratio,
|
||||
recompute_embeddings=args.recompute_embeddings,
|
||||
pruning_strategy=args.pruning_strategy,
|
||||
llm_kwargs=llm_kwargs,
|
||||
)
|
||||
print(f"LEANN: {response}")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user