dgx-spark-vllm-setup/examples/api_client.py

#!/usr/bin/env python3
"""
vLLM OpenAI-Compatible API Client Example
Demonstrates using vLLM's OpenAI-compatible API endpoints
"""

import requests
import json
from typing import Dict, List

class VLLMClient:
    """Simple client for vLLM OpenAI-compatible API"""

    def __init__(self, base_url: str = "http://localhost:8000"):
        self.base_url = base_url.rstrip('/')

    def list_models(self) -> List[Dict]:
        """List available models"""
        response = requests.get(f"{self.base_url}/v1/models")
        response.raise_for_status()
        return response.json()

    def complete(
        self,
        prompt: str,
        model: str = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate completion"""

        # Get model name if not specified
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']

        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }

        response = requests.post(
            f"{self.base_url}/v1/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response.iter_lines()
        else:
            return response.json()

    def chat(
        self,
        messages: List[Dict[str, str]],
        model: str = None,
        max_tokens: int = 100,
        temperature: float = 0.7,
        stream: bool = False
    ) -> Dict:
        """Generate chat completion"""

        # Get model name if not specified
        if model is None:
            models = self.list_models()
            model = models['data'][0]['id']

        payload = {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
            "temperature": temperature,
            "stream": stream
        }

        response = requests.post(
            f"{self.base_url}/v1/chat/completions",
            json=payload,
            headers={"Content-Type": "application/json"},
            stream=stream
        )
        response.raise_for_status()

        if stream:
            return response.iter_lines()
        else:
            return response.json()


def main():
    # Initialize client
    client = VLLMClient("http://localhost:8000")

    print("="*60)
    print("vLLM API Client Examples")
    print("="*60)

    # Example 1: List models
    print("\n1. Listing available models...")
    models = client.list_models()
    for model in models['data']:
        print(f"   - {model['id']}")

    # Example 2: Simple completion
    print("\n2. Simple completion...")
    result = client.complete(
        prompt="The capital of France is",
        max_tokens=10,
        temperature=0.0
    )
    print(f"   Prompt: The capital of France is")
    print(f"   Response: {result['choices'][0]['text']}")

    # Example 3: Chat completion
    print("\n3. Chat completion...")
    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "What is the Blackwell GPU architecture?"}
    ]
    result = client.chat(
        messages=messages,
        max_tokens=100,
        temperature=0.7
    )
    print(f"   User: {messages[1]['content']}")
    print(f"   Assistant: {result['choices'][0]['message']['content']}")

    # Example 4: Streaming completion
    print("\n4. Streaming completion...")
    print("   Prompt: Write a short poem about AI")
    print("   Response: ", end="", flush=True)

    stream = client.complete(
        prompt="Write a short poem about AI",
        max_tokens=50,
        temperature=0.8,
        stream=True
    )

    for line in stream:
        if line:
            try:
                data = json.loads(line.decode('utf-8').removeprefix('data: '))
                if 'choices' in data and len(data['choices']) > 0:
                    token = data['choices'][0].get('text', '')
                    print(token, end="", flush=True)
            except (json.JSONDecodeError, AttributeError):
                pass

    print("\n")
    print("="*60)

if __name__ == "__main__":
    main()