All Tutorials
API

🌊 Streaming Responses

Display AI responses in real-time as they're generated for better UX.

12 min Intermediate
1

Why Use Streaming?

Streaming lets you display AI-generated text token by token as it's produced, instead of waiting for the complete response.

Without Streaming

  • Wait 3-10 seconds for full response
  • Users see loading spinner
  • All-or-nothing result
  • Feels slow and unresponsive

With Streaming

  • First token in ~200ms
  • Text appears word by word
  • Instant feedback
  • Feels fast and interactive
2

Enable Streaming in API Calls

Add "stream": true to your API request:

cURL
# Request a completion with "stream": true; the response body then arrives
# as a sequence of Server-Sent Events instead of a single JSON document.
curl https://api.vigthoria.io/v1/chat/completions \
  -H "Authorization: Bearer YOUR_API_KEY" \
  -H "Content-Type: application/json" \
  -d '{
    "model": "vigthoria-creative-v2",
    "messages": [{"role": "user", "content": "Write a poem about technology"}],
    "stream": true
  }'

The response body arrives as a stream of Server-Sent Events (SSE); each event is a line prefixed with `data: `, and the stream is terminated by `data: [DONE]`:

data: {"choices":[{"delta":{"content":"In"}}]}

data: {"choices":[{"delta":{"content":" circuits"}}]}

data: {"choices":[{"delta":{"content":" deep"}}]}

data: [DONE]
3

Handle Streaming in JavaScript

Browser (Fetch API)
// Stream a chat completion from the API and render it into #output token by
// token. Returns the full accumulated text once the stream finishes.
// Throws on a non-2xx HTTP response.
async function streamChat(prompt) {
  const response = await fetch('https://api.vigthoria.io/v1/chat/completions', {
    method: 'POST',
    headers: {
      'Authorization': `Bearer ${API_KEY}`,
      'Content-Type': 'application/json',
    },
    body: JSON.stringify({
      model: 'vigthoria-creative-v2',
      messages: [{ role: 'user', content: prompt }],
      stream: true
    })
  });

  // Fail fast instead of trying to parse an HTML/JSON error body as SSE.
  if (!response.ok) {
    throw new Error(`API request failed: ${response.status} ${response.statusText}`);
  }

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let fullText = '';
  let buffer = ''; // carries a partial SSE line that spans two network chunks

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;

    // { stream: true } keeps multi-byte UTF-8 sequences intact across chunks.
    buffer += decoder.decode(value, { stream: true });

    // Only process complete lines; a chunk boundary can fall mid-event, and
    // JSON.parse on a partial line would throw. Keep the tail for next read.
    const lines = buffer.split('\n');
    buffer = lines.pop() ?? '';

    for (const line of lines) {
      if (line.startsWith('data: ') && line !== 'data: [DONE]') {
        const json = JSON.parse(line.slice(6));
        const content = json.choices[0]?.delta?.content ?? '';
        fullText += content;

        // Update UI in real-time
        document.getElementById('output').textContent = fullText;
      }
    }
  }

  return fullText;
}

// Usage
streamChat('Explain quantum computing');
4

Handle Streaming in Python

Python (requests)
import requests
import json

def stream_chat(prompt):
    """Stream a chat completion, printing tokens to stdout as they arrive.

    Args:
        prompt: The user message to send to the model.

    Returns:
        The full accumulated response text.

    Raises:
        requests.HTTPError: If the API returns a non-2xx status.
    """
    response = requests.post(
        'https://api.vigthoria.io/v1/chat/completions',
        headers={
            'Authorization': f'Bearer {API_KEY}',
            'Content-Type': 'application/json',
        },
        json={
            'model': 'vigthoria-creative-v2',
            'messages': [{'role': 'user', 'content': prompt}],
            'stream': True
        },
        stream=True  # keep the connection open; iterate the body lazily
    )
    # Surface HTTP errors instead of silently iterating an error body as SSE.
    response.raise_for_status()

    full_text = ''

    for line in response.iter_lines():
        if line:
            line = line.decode('utf-8')
            if line.startswith('data: ') and line != 'data: [DONE]':
                data = json.loads(line[6:])
                content = data['choices'][0].get('delta', {}).get('content', '')
                full_text += content
                print(content, end='', flush=True)  # Real-time output

    print()  # Newline after stream ends
    return full_text

# Usage
result = stream_chat('Write a haiku about AI')
Python (Async with aiohttp)
import aiohttp
import asyncio
import json

async def stream_chat_async(prompt):
    """Async generator: yield response text chunks as they stream in.

    Args:
        prompt: The user message to send to the model.

    Yields:
        Each non-empty content delta as a string.

    Raises:
        aiohttp.ClientResponseError: If the API returns a non-2xx status.
    """
    async with aiohttp.ClientSession() as session:
        async with session.post(
            'https://api.vigthoria.io/v1/chat/completions',
            headers={
                'Authorization': f'Bearer {API_KEY}',
                'Content-Type': 'application/json',
            },
            json={
                'model': 'vigthoria-creative-v2',
                'messages': [{'role': 'user', 'content': prompt}],
                'stream': True
            }
        ) as response:
            # Fail fast before trying to parse an error body as SSE.
            response.raise_for_status()
            # Iterating response.content yields one line per iteration.
            async for line in response.content:
                line = line.decode('utf-8').strip()
                if line.startswith('data: ') and line != 'data: [DONE]':
                    data = json.loads(line[6:])
                    content = data['choices'][0].get('delta', {}).get('content', '')
                    if content:
                        yield content  # Yield each chunk

# Usage
async def main():
    async for chunk in stream_chat_async('Explain machine learning'):
        print(chunk, end='', flush=True)

asyncio.run(main())
5

React Implementation

React Component
import { useState } from 'react';

function StreamingChat() {
  const [response, setResponse] = useState('');
  const [loading, setLoading] = useState(false);

  async function handleSubmit(prompt) {
    setLoading(true);
    setResponse('');

    const res = await fetch('/api/chat', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ prompt, stream: true })
    });

    const reader = res.body.getReader();
    const decoder = new TextDecoder();

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      const chunk = decoder.decode(value);
      const lines = chunk.split('\n');

      for (const line of lines) {
        if (line.startsWith('data: ') && line !== 'data: [DONE]') {
          const json = JSON.parse(line.slice(6));
          const content = json.choices[0]?.delta?.content || '';
          setResponse(prev => prev + content);
        }
      }
    }

    setLoading(false);
  }

  return (
    <div>
      <button onClick={() => handleSubmit('Hello!')}>
        {loading ? 'Generating...' : 'Send'}
      </button>
      <div className="response">{response}</div>
    </div>
  );
}
6

Best Practices

Streaming Tips
  • Add a cursor animation — Show a blinking cursor while streaming
  • Handle errors gracefully — Stream can be interrupted; catch and display errors
  • Implement cancel — Let users abort long generations with AbortController
  • Buffer for markdown — If rendering markdown, buffer slightly to avoid partial syntax
  • Track tokens — Count tokens as they stream for usage tracking
Cancellation Example
// Create one controller per request; its signal ties the fetch to the abort.
const controller = new AbortController();

// Start streaming
fetch(url, { 
  signal: controller.signal,
  // ... other options
});

// Cancel button handler
// Aborting rejects the in-flight fetch (and any pending reader.read())
// with a DOMException whose name is 'AbortError'.
cancelButton.onclick = () => controller.abort();