first commit

commit c05cb71816
2026-03-22 17:26:26 -04:00
15 changed files with 2644 additions and 0 deletions

examples/api_client.py (new file, 160 lines)

@@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
vLLM OpenAI-Compatible API Client Example

Demonstrates using vLLM's OpenAI-compatible API endpoints.
"""
import json
from typing import Dict, List, Optional

import requests


class VLLMClient:
    """Simple client for the vLLM OpenAI-compatible API."""

    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    def list_models(self) -> Dict:
        """List available models (response body of GET /v1/models)."""
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def complete(
        self,
        prompt: str,
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ):
        """Generate a completion.

        Returns the parsed JSON response, or an iterator over raw
        response lines when stream=True.
        """
        # Fall back to the first model the server advertises
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(
            f"{self.base_url}/v1/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()
        if stream:
            # Server-sent events: hand the raw lines back to the caller
            return response.iter_lines()
        return response.json()

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ):
        """Generate a chat completion.

        Returns the parsed JSON response, or an iterator over raw
        response lines when stream=True.
        """
        # Fall back to the first model the server advertises
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }
        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()
        if stream:
            # Server-sent events: hand the raw lines back to the caller
            return response.iter_lines()
        return response.json()


def main():
    # Initialize client
    client = VLLMClient("http://localhost:8000")

    print("=" * 60)
    print("vLLM API Client Examples")
    print("=" * 60)

    # Example 1: List models
    print("\n1. Listing available models...")
    models = client.list_models()
    for model in models['data']:
        print(f"  - {model['id']}")

    # Example 2: Simple completion
    print("\n2. Simple completion...")
    result = client.complete(
        prompt="The capital of France is",
        max_tokens=10,
        temperature=0.0
    )
    print("  Prompt: The capital of France is")
    print(f"  Response: {result['choices'][0]['text']}")

    # Example 3: Chat completion
    print("\n3. Chat completion...")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the Blackwell GPU architecture?"}
    ]
    result = client.chat(
        messages=messages,
        max_tokens=100,
        temperature=0.7
    )
    print(f"  User: {messages[1]['content']}")
    print(f"  Assistant: {result['choices'][0]['message']['content']}")

    # Example 4: Streaming completion
    print("\n4. Streaming completion...")
    print("  Prompt: Write a short poem about AI")
    print("  Response: ", end="", flush=True)
    stream = client.complete(
        prompt="Write a short poem about AI",
        max_tokens=50,
        temperature=0.8,
        stream=True
    )
    for line in stream:
        if not line:
            continue
        # Each SSE line looks like b'data: {...}'; the final sentinel is
        # b'data: [DONE]', which fails json.loads and is skipped below.
        try:
            data = json.loads(line.decode('utf-8').removeprefix('data: '))
            if data.get('choices'):
                print(data['choices'][0].get('text', ''), end="", flush=True)
        except (json.JSONDecodeError, AttributeError):
            pass
    print("\n")
    print("=" * 60)


if __name__ == "__main__":
    main()
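
Note: the example assumes a vLLM OpenAI-compatible server is already listening on http://localhost:8000; the server itself is not part of this commit. One way to start one (the model name is a placeholder, not something this commit pins):

    python -m vllm.entrypoints.openai.api_server --model <model-name> --port 8000

Recent vLLM releases also provide the equivalent "vllm serve <model-name>" shortcut. The client uses str.removeprefix, so it needs Python 3.9 or later.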