# pen2post/Server/.env
# ===========================================
# HTR API Configuration - Enhanced Version
# ===========================================
# --- Model Selection ---
# Change these when switching models:
VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ4
VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ4
# --- vLLM Server Settings ---
VLLM_HOST=0.0.0.0
VLLM_PORT=8001
VLLM_GPU_UTIL=0.90
# Model-specific max context length
VLLM_MAX_MODEL_LEN=2560
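# Example launch (a sketch, assuming the standard `vllm serve` CLI and that
# this file has been exported first, e.g. `set -a; source .env; set +a`;
# VLLM_KV_CACHE_DTYPE is set just below):
# vllm serve "$VLLM_MODEL_PATH" \
#   --served-model-name "$VLLM_MODEL_NAME" \
#   --host "$VLLM_HOST" \
#   --port "$VLLM_PORT" \
#   --gpu-memory-utilization "$VLLM_GPU_UTIL" \
#   --max-model-len "$VLLM_MAX_MODEL_LEN" \
#   --kv-cache-dtype "$VLLM_KV_CACHE_DTYPE"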
# --- PERFORMANCE BOOST: KV Cache Quantization ---
# This is the #1 performance optimization!
# Options: auto (default), fp8, fp8_e4m3, fp8_e5m2
# fp8 ≈ 2x concurrent user capacity with minimal quality loss (<0.1%)
VLLM_KV_CACHE_DTYPE=fp8
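# Back-of-envelope for the 2x claim (illustrative): KV bytes per token ≈
# 2 (K and V) * num_layers * num_kv_heads * head_dim * bytes_per_value.
# Dropping from 16-bit (2 bytes) to fp8 (1 byte) halves the cache footprint,
# so roughly twice as many concurrent sequences fit in the same VRAM.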
# --- Sampling Parameters (Override model defaults!) ---
# CRITICAL: These override the defaults in the model's generation_config.json.
# Without them, vLLM falls back to model-specific defaults, which vary by model.
# Temperature: 0.0 = deterministic, higher = more creative
# For HTR: 0.1 works well (consistent output without getting stuck in loops)
SAMPLING_TEMPERATURE=0.1
# Max tokens to generate per request
SAMPLING_MAX_TOKENS=500
# Top-p (nucleus) sampling (0.0-1.0) - sample from the top 95% of probability mass
SAMPLING_TOP_P=0.95
# Top-k sampling (0 = disabled)
SAMPLING_TOP_K=0
# Presence penalty (0.0 = none, positive = encourage diversity)
SAMPLING_PRESENCE_PENALTY=0.0
# Frequency penalty (0.0 = none, positive = reduce repetition)
SAMPLING_FREQUENCY_PENALTY=0.0
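# Example request carrying these overrides (a sketch; the prompt text is a
# placeholder and the fields are standard OpenAI-compatible parameters):
# curl -s "$VLLM_ENDPOINT" -H 'Content-Type: application/json' -d '{
#   "model": "'"$VLLM_MODEL_NAME"'",
#   "messages": [{"role": "user", "content": "Transcribe this page."}],
#   "temperature": 0.1, "max_tokens": 500, "top_p": 0.95,
#   "presence_penalty": 0.0, "frequency_penalty": 0.0
# }'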
# --- CRITICAL: Stop Sequences ---
# This is why the 2B model strips newlines in vLLM but not in LM Studio!
# The model's generation_config.json may define default stop tokens like ["\n", "\n\n"]
# Empty = Override model defaults, preserve ALL newlines
SAMPLING_STOP_SEQUENCES=
# Alternative: Use only model's special tokens (if needed)
# SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>
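# Parsing sketch (assumption: the FastAPI layer splits this comma-separated
# value into the request's "stop" array; equivalent bash shown here):
# IFS=',' read -r -a STOPS <<< "$SAMPLING_STOP_SEQUENCES"
# echo "${#STOPS[@]} stop sequence(s)"   # 0 when the variable is empty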
# --- FastAPI Settings ---
VLLM_ENDPOINT=http://127.0.0.1:8001/v1/chat/completions
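# Quick connectivity check (/v1/models is a standard route on vLLM's
# OpenAI-compatible server):
# curl -s http://127.0.0.1:8001/v1/models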
# --- Database ---
DATABASE_PATH=/home/fenix/htr-api/htr_usage.db
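# Inspect usage data locally (standard sqlite3 CLI; table names depend on
# the API's schema):
# sqlite3 "$DATABASE_PATH" '.tables'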
# --- Limits ---
MAX_IMAGE_SIZE=10485760
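# 10485760 bytes = 10 * 1024 * 1024 = 10 MiB per uploaded image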
# --- OpenRouter (for future cloud fallback) ---
# OPENROUTER_API_KEY=your_key_here
# OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1/chat/completions
# OPENROUTER_MODEL=qwen/qwen3-vl-8b-instruct
# ===========================================
# Model-Specific Presets (Copy to active settings above)
# ===========================================
# --- Qwen3-VL-2B-FP8 (Fastest: 161 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-FP8
# VLLM_MODEL_NAME=Qwen3-VL-2B-FP8
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 3.5GB | Concurrent: ~35-40 users with fp8 cache
# --- Qwen3-VL-2B-BF16 (Fast: 121.8 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-BF16
# VLLM_MODEL_NAME=Qwen3-VL-2B-BF16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 4.3GB | Concurrent: ~35-40 users with fp8 cache
# --- Qwen3-VL-8B-AWQ4 (Balanced: 88 tok/s, 99.17% accuracy) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-awq4
# VLLM_MODEL_NAME=qwen3-vl-8b-awq4
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 7.5GB | Concurrent: ~25-30 users with fp8 cache
# --- Qwen3-VL-8B-AWQ8 (Good: 61 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ-8bit
# VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ-8bit
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 10.8GB | Concurrent: ~20-25 users with fp8 cache
# --- Qwen3-VL-8B-BF16 (Highest quality: 42.1 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-bf16
# VLLM_MODEL_NAME=qwen3-vl-8b-bf16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 17.5GB | Concurrent: ~15-20 users with fp8 cache
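# After copying a preset above and restarting vLLM (restart mechanism depends
# on your deployment), confirm which model is being served:
# curl -s http://127.0.0.1:8001/v1/models | grep -o '"id":[^,]*'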
# ===========================================
# Performance Impact Summary
# ===========================================
# KV_CACHE_DTYPE Impact on Concurrent Capacity (A6000 48GB):
# ┌─────────┬──────────────┬──────────────┬─────────┐
# │ Model   │ auto (16bit) │ fp8 (8bit)   │ Speedup │
# ├─────────┼──────────────┼──────────────┼─────────┤
# │ 2B-FP8  │ ~18-20 users │ ~35-40 users │ 2.0x    │
# │ 2B-BF16 │ ~16-18 users │ ~35-40 users │ 2.2x    │
# │ 8B-AWQ4 │ ~14-16 users │ ~25-30 users │ 1.9x    │
# │ 8B-AWQ8 │ ~12-14 users │ ~20-25 users │ 1.8x    │
# │ 8B-BF16 │ ~8-10 users  │ ~15-20 users │ 2.0x    │
# └─────────┴──────────────┴──────────────┴─────────┘
# Accuracy impact: <0.1% drop with fp8 cache (negligible)
# Recommendation: ALWAYS use fp8 for production
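# Worked example from the numbers above (illustrative): at VLLM_GPU_UTIL=0.90
# an A6000 budgets ~43.2GB; with 8B-AWQ4 weights (7.5GB) that leaves ~35GB
# for KV cache, and halving the per-token cache cost with fp8 is what turns
# ~14-16 users into ~25-30.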
# Temperature Impact:
# 0.0 = Fully deterministic (can cause loops)
# 0.1 = Near-deterministic, robust (RECOMMENDED for HTR)
# 0.3 = Slight variation
# 0.7+ = Too creative for transcription
# Stop Sequences Impact:
# WITH newline stops: 97.10% char accuracy (paragraphs merged)
# WITHOUT newline stops: 99.17% char accuracy (preserves formatting)