# ===========================================
# HTR API Configuration - Enhanced Version
# ===========================================

# --- Model Selection ---
# Change these when switching models:
VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ4
VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ4

# --- vLLM Server Settings ---
VLLM_HOST=0.0.0.0
VLLM_PORT=8001
# Fraction of GPU memory vLLM is allowed to allocate
VLLM_GPU_UTIL=0.90
# Model-specific max context length
VLLM_MAX_MODEL_LEN=2560

# --- PERFORMANCE BOOST: KV Cache Quantization ---
# This is the #1 performance optimization!
# Options: auto (default), fp8, fp8_e4m3, fp8_e5m2
# fp8 = ~2x concurrent capacity, minimal quality loss (<0.1%)
VLLM_KV_CACHE_DTYPE=fp8

# --- Sampling Parameters (Override model defaults!) ---
# CRITICAL: These override the defaults shipped in the model's
# generation_config.json. Without them, vLLM falls back to
# model-specific defaults, which vary from model to model.
# Temperature: 0.0 = deterministic, higher = more creative
# For HTR, 0.1 is the sweet spot (consistent without getting stuck in loops)
SAMPLING_TEMPERATURE=0.1
# Max tokens to generate per request
SAMPLING_MAX_TOKENS=500
# Top-p sampling (0.0-1.0) - sample from the top 95% of probability mass
SAMPLING_TOP_P=0.95
# Top-k sampling (0 = disabled)
SAMPLING_TOP_K=0
# Presence penalty (0.0 = none, positive = encourage diversity)
SAMPLING_PRESENCE_PENALTY=0.0
# Frequency penalty (0.0 = none, positive = reduce repetition)
SAMPLING_FREQUENCY_PENALTY=0.0

# --- CRITICAL: Stop Sequences ---
# This is WHY your 2B strips newlines in vLLM but not LM Studio!
# The model's generation config may ship default stop tokens like ["\n", "\n\n"]
# Empty = override model defaults and preserve ALL newlines
SAMPLING_STOP_SEQUENCES=
# Alternative: use only the model's special tokens (if needed)
# SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>

# --- FastAPI Settings ---
VLLM_ENDPOINT=http://127.0.0.1:8001/v1/chat/completions

# --- Database ---
DATABASE_PATH=/home/fenix/htr-api/htr_usage.db

# --- Limits ---
# Max upload size in bytes (10 MiB)
MAX_IMAGE_SIZE=10485760

# --- OpenRouter (for future cloud fallback) ---
# OPENROUTER_API_KEY=your_key_here
# OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1/chat/completions
# OPENROUTER_MODEL=qwen/qwen3-vl-8b-instruct

# ===========================================
# Model-Specific Presets (Copy to active settings above)
# ===========================================

# --- Qwen3-VL-2B-FP8 (Fastest: 161 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-FP8
# VLLM_MODEL_NAME=Qwen3-VL-2B-FP8
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 3.5GB | Concurrent: ~35-40 users with fp8 cache

# --- Qwen3-VL-2B-BF16 (Fast: 121.8 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-BF16
# VLLM_MODEL_NAME=Qwen3-VL-2B-BF16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 4.3GB | Concurrent: ~35-40 users with fp8 cache

# --- Qwen3-VL-8B-AWQ4 (Balanced: 88 tok/s, 99.17% accuracy) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-awq4
# VLLM_MODEL_NAME=qwen3-vl-8b-awq4
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 7.5GB | Concurrent: ~25-30 users with fp8 cache

# --- Qwen3-VL-8B-AWQ8 (Good: 61 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ-8bit
# VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ-8bit
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 10.8GB | Concurrent: ~20-25 users with fp8 cache

# --- Qwen3-VL-8B-BF16 (Highest quality: 42.1 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-bf16
# VLLM_MODEL_NAME=qwen3-vl-8b-bf16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 17.5GB | Concurrent: ~15-20 users with fp8 cache
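# ===========================================
# Usage Examples (illustrative sketches, not project code)
# ===========================================

# Launching vLLM with the settings above. A minimal sketch assuming the
# standard `vllm serve` CLI (--served-model-name, --host, --port,
# --gpu-memory-utilization, --max-model-len and --kv-cache-dtype are all
# documented vLLM flags); this project's actual launch script may differ:
#
#   set -a; source .env; set +a    # export every variable in this file
#   vllm serve "$VLLM_MODEL_PATH" \
#     --served-model-name "$VLLM_MODEL_NAME" \
#     --host "$VLLM_HOST" \
#     --port "$VLLM_PORT" \
#     --gpu-memory-utilization "$VLLM_GPU_UTIL" \
#     --max-model-len "$VLLM_MAX_MODEL_LEN" \
#     --kv-cache-dtype "$VLLM_KV_CACHE_DTYPE"
#
# After startup, GET http://127.0.0.1:8001/v1/models should list the model
# under the name given in VLLM_MODEL_NAME.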
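# How the SAMPLING_* values above map onto an OpenAI-compatible request.
# This is a hypothetical hand-rolled request (the FastAPI layer builds the
# real one server-side, including an image_url content part for the page
# scan). The explicit "stop": [] is what overrides any model-default stop
# sequences so newlines survive in the transcription:
#
#   curl -s "$VLLM_ENDPOINT" -H 'Content-Type: application/json' -d '{
#     "model": "Qwen3-VL-8B-AWQ4",
#     "messages": [{"role": "user", "content": "Transcribe this page."}],
#     "temperature": 0.1,
#     "max_tokens": 500,
#     "top_p": 0.95,
#     "presence_penalty": 0.0,
#     "frequency_penalty": 0.0,
#     "stop": []
#   }'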
# ===========================================
# Performance Impact Summary
# ===========================================

# KV_CACHE_DTYPE Impact on Concurrent Capacity (A6000 48GB):
# ┌─────────────┬──────────────┬──────────────┬──────────┐
# │ Model       │ auto (16bit) │ fp8 (8bit)   │ Gain     │
# ├─────────────┼──────────────┼──────────────┼──────────┤
# │ 2B-FP8      │ ~18-20 users │ ~35-40 users │ 2.0x     │
# │ 2B-BF16     │ ~16-18 users │ ~35-40 users │ 2.2x     │
# │ 8B-AWQ4     │ ~14-16 users │ ~25-30 users │ 1.9x     │
# │ 8B-AWQ8     │ ~12-14 users │ ~20-25 users │ 1.8x     │
# │ 8B-BF16     │ ~8-10 users  │ ~15-20 users │ 2.0x     │
# └─────────────┴──────────────┴──────────────┴──────────┘
# Accuracy impact: <0.1% drop with fp8 cache (negligible)
# Recommendation: ALWAYS use fp8 for production

# Temperature Impact:
#   0.0  = Fully deterministic (can cause repetition loops)
#   0.1  = Near-deterministic, robust (RECOMMENDED for HTR)
#   0.3  = Slight variation
#   0.7+ = Too creative for transcription

# Stop Sequences Impact:
#   WITH newline stops:    97.10% char accuracy (paragraphs merged)
#   WITHOUT newline stops: 99.17% char accuracy (preserves formatting)
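# Why fp8 roughly doubles capacity (back-of-envelope sketch; exact numbers
# depend on the model's layer count and attention head geometry):
#
#   KV cache bytes per token = 2 (K and V) * num_layers * num_kv_heads
#                              * head_dim * bytes_per_element
#
# Dropping from 16-bit (2 bytes/element) to fp8 (1 byte/element) halves the
# per-token footprint, so roughly twice as many concurrent sequences fit in
# the same cache budget, matching the ~1.8-2.2x gains in the table above.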