first commit
This commit is contained in:
160
examples/api_client.py
Normal file
160
examples/api_client.py
Normal file
@@ -0,0 +1,160 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
vLLM OpenAI-Compatible API Client Example
|
||||
Demonstrates using vLLM's OpenAI-compatible API endpoints
|
||||
"""
|
||||
|
||||
import json
from typing import Dict, List, Optional

import requests
|
||||
|
||||
class VLLMClient:
    """Simple client for a vLLM server's OpenAI-compatible HTTP API.

    Wraps the /v1/models, /v1/completions and /v1/chat/completions
    endpoints of a running vLLM server.
    """

    def __init__(self, base_url: str = "http://localhost:8000",
                 timeout: float = 60.0):
        """
        Args:
            base_url: Root URL of the vLLM server.
            timeout: Per-request timeout in seconds. (Without one,
                ``requests`` calls can block forever on a hung server.)
        """
        # Normalize so endpoint paths can be appended with a single '/'.
        self.base_url = base_url.rstrip('/')
        self.timeout = timeout

    def list_models(self) -> Dict:
        """Return the server's model listing.

        Returns:
            The OpenAI-style listing dict, e.g. ``{"data": [{"id": ...}, ...]}``.
            (The original annotation said ``List[Dict]``, but the endpoint
            returns a single JSON object.)

        Raises:
            requests.HTTPError: On a non-2xx response.
        """
        response = requests.get(f"{self.base_url}/v1/models",
                                timeout=self.timeout)
        response.raise_for_status()
        return response.json()

    def _default_model(self) -> str:
        """Return the id of the first model the server reports."""
        return self.list_models()['data'][0]['id']

    def _post(self, endpoint: str, payload: Dict, stream: bool):
        """POST *payload* to *endpoint* and return the decoded result.

        Returns the parsed JSON dict, or an iterator over raw response
        lines when *stream* is true.
        """
        response = requests.post(
            f"{self.base_url}{endpoint}",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return response.iter_lines() if stream else response.json()

    def complete(
        self,
        prompt: str,
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate a text completion via /v1/completions.

        Args:
            prompt: Text to continue.
            model: Model id; defaults to the first model the server serves.
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature.
            stream: If True, return an iterator over raw SSE lines
                instead of a parsed dict.
        """
        if model is None:
            model = self._default_model()
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream,
        }
        return self._post("/v1/completions", payload, stream)

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: Optional[str] = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate a chat completion via /v1/chat/completions.

        Args:
            messages: OpenAI-style message dicts with "role" and "content".
            model: Model id; defaults to the first model the server serves.
            max_tokens: Maximum number of tokens to generate.
            temperature: Sampling temperature.
            stream: If True, return an iterator over raw SSE lines
                instead of a parsed dict.
        """
        if model is None:
            model = self._default_model()
        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream,
        }
        return self._post("/v1/chat/completions", payload, stream)
|
||||
|
||||
|
||||
def main():
    """Demonstrate the client: list models, completion, chat, streaming.

    Requires a vLLM server listening on http://localhost:8000.
    """
    client = VLLMClient("http://localhost:8000")

    print("=" * 60)
    print("vLLM API Client Examples")
    print("=" * 60)

    # Example 1: List models
    print("\n1. Listing available models...")
    models = client.list_models()
    for model in models['data']:
        print(f" - {model['id']}")

    # Example 2: Simple completion (temperature 0 for a deterministic answer)
    print("\n2. Simple completion...")
    result = client.complete(
        prompt="The capital of France is",
        max_tokens=10,
        temperature=0.0
    )
    # Plain strings here: the originals were f-strings with no placeholders.
    print(" Prompt: The capital of France is")
    print(f" Response: {result['choices'][0]['text']}")

    # Example 3: Chat completion
    print("\n3. Chat completion...")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the Blackwell GPU architecture?"}
    ]
    result = client.chat(
        messages=messages,
        max_tokens=100,
        temperature=0.7
    )
    print(f" User: {messages[1]['content']}")
    print(f" Assistant: {result['choices'][0]['message']['content']}")

    # Example 4: Streaming completion
    print("\n4. Streaming completion...")
    print(" Prompt: Write a short poem about AI")
    print(" Response: ", end="", flush=True)

    stream = client.complete(
        prompt="Write a short poem about AI",
        max_tokens=50,
        temperature=0.8,
        stream=True
    )

    for line in stream:
        if not line:
            continue  # keep-alive blank lines in the SSE stream
        text = line.decode('utf-8').removeprefix('data: ')
        # The OpenAI SSE protocol terminates with a literal "[DONE]"
        # sentinel, which is not JSON — stop instead of silently
        # swallowing it in an except clause.
        if text.strip() == "[DONE]":
            break
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            continue  # skip any non-JSON noise on the stream
        if data.get('choices'):
            print(data['choices'][0].get('text', ''), end="", flush=True)

    print("\n")
    print("=" * 60)
|
||||
|
||||
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user