#!/usr/bin/env python3
"""
vLLM OpenAI-Compatible API Client Example

Demonstrates using vLLM's OpenAI-compatible API endpoints.
"""

import json
from typing import Dict, Iterator, List, Optional, Union

import requests

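# NOTE: this script assumes a vLLM server is already running locally,
# started with something along the lines of (exact flags depend on your
# vLLM version and the model you serve):
#
#   vllm serve <model-name>
#   # or, on older vLLM versions:
#   python -m vllm.entrypoints.openai.api_server --model <model-name>
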
class VLLMClient:
    """Simple client for vLLM's OpenAI-compatible API."""

    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    def list_models(self) -> Dict:
        """Return the raw /v1/models response (models are listed under 'data')."""
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def complete(
        self,
        prompt: str,
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Union[Dict, Iterator[bytes]]:
        """Generate a text completion.

        Returns the parsed JSON response, or an iterator over raw
        server-sent-event lines when stream=True.
        """
        # Default to the first model the server advertises
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']

        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }

        response = requests.post(
            f"{self.base_url}/v1/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response.iter_lines()
        return response.json()

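    # A (truncated) non-streaming /v1/completions response has roughly this
    # shape (illustrative values, not actual server output):
    #   {"id": "cmpl-...", "object": "text_completion",
    #    "choices": [{"index": 0, "text": " Paris", "finish_reason": "stop"}],
    #    "usage": {"prompt_tokens": 5, "completion_tokens": 2, "total_tokens": 7}}
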
    def chat(
        self,
        messages: List[Dict[str, str]],
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Union[Dict, Iterator[bytes]]:
        """Generate a chat completion.

        Returns the parsed JSON response, or an iterator over raw
        server-sent-event lines when stream=True.
        """
        # Default to the first model the server advertises
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response.iter_lines()
        return response.json()


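# The same server also speaks the official `openai` Python SDK. The sketch
# below is illustrative only (it is not called from main()) and assumes the
# `openai` package (>= 1.0) is installed; vLLM ignores the API key by
# default, so any placeholder value works.
def openai_sdk_example(base_url: str = "http://localhost:8000/v1") -> None:
    """Minimal sketch of talking to the same server via the openai SDK."""
    from openai import OpenAI  # optional dependency, imported lazily

    client = OpenAI(base_url=base_url, api_key="EMPTY")
    model = client.models.list().data[0].id  # first advertised model
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=30,
    )
    print(response.choices[0].message.content)

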
def main():
    # Initialize client
    client = VLLMClient("http://localhost:8000")

    print("=" * 60)
    print("vLLM API Client Examples")
    print("=" * 60)

    # Example 1: List models
    print("\n1. Listing available models...")
    models = client.list_models()
    for model in models['data']:
        print(f"   - {model['id']}")

    # Example 2: Simple completion
    print("\n2. Simple completion...")
    prompt = "The capital of France is"
    result = client.complete(
        prompt=prompt,
        max_tokens=10,
        temperature=0.0
    )
    print(f"   Prompt: {prompt}")
    print(f"   Response: {result['choices'][0]['text']}")

    # Example 3: Chat completion
    print("\n3. Chat completion...")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the Blackwell GPU architecture?"}
    ]
    result = client.chat(
        messages=messages,
        max_tokens=100,
        temperature=0.7
    )
    print(f"   User: {messages[1]['content']}")
    print(f"   Assistant: {result['choices'][0]['message']['content']}")

    # Example 4: Streaming completion
    print("\n4. Streaming completion...")
    print("   Prompt: Write a short poem about AI")
    print("   Response: ", end="", flush=True)

    stream = client.complete(
        prompt="Write a short poem about AI",
        max_tokens=50,
        temperature=0.8,
        stream=True
    )

    # Each streamed line is a server-sent event: b'data: {...json...}',
    # terminated by a final b'data: [DONE]' sentinel.
    for line in stream:
        if not line:
            continue
        decoded = line.decode('utf-8').removeprefix('data: ')
        if decoded.strip() == '[DONE]':
            break
        try:
            data = json.loads(decoded)
        except json.JSONDecodeError:
            continue
        if data.get('choices'):
            token = data['choices'][0].get('text', '')
            print(token, end="", flush=True)

print("\n")
|
|
print("="*60)
|
|
|
|
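# For reference, roughly the same completion request can be issued with curl
# (illustrative; use a model name actually served by your instance):
#
#   curl http://localhost:8000/v1/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "<model>", "prompt": "Hello", "max_tokens": 10}'
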
if __name__ == "__main__":
    main()