Initial PoC commit
Server/.env (new file, +135 lines)
@@ -0,0 +1,135 @@
# ===========================================
# HTR API Configuration - Enhanced Version
# ===========================================

# --- Model Selection ---
# Change these when switching models:
VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ4
VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ4

# --- vLLM Server Settings ---
VLLM_HOST=0.0.0.0
VLLM_PORT=8001
VLLM_GPU_UTIL=0.90

# Model-specific max context length
VLLM_MAX_MODEL_LEN=2560

# --- PERFORMANCE BOOST: KV Cache Quantization ---
# This is the #1 performance optimization!
# Options: auto (default), fp8, fp8_e4m3, fp8_e5m2, int8
# fp8 = 2x more concurrent users, minimal quality loss (<0.1%)
VLLM_KV_CACHE_DTYPE=fp8
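
# How these settings are expected to reach vLLM (a sketch only: the launch script
# is not part of this file, and the flags assume vLLM's OpenAI-compatible server CLI):
#
#   vllm serve "$VLLM_MODEL_PATH" \
#     --served-model-name "$VLLM_MODEL_NAME" \
#     --host "$VLLM_HOST" --port "$VLLM_PORT" \
#     --gpu-memory-utilization "$VLLM_GPU_UTIL" \
#     --max-model-len "$VLLM_MAX_MODEL_LEN" \
#     --kv-cache-dtype "$VLLM_KV_CACHE_DTYPE"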

# --- Sampling Parameters (Override model defaults!) ---
# CRITICAL: These override the model's config.json defaults.
# Without these, vLLM falls back to model-specific defaults, which vary between models.

# Temperature: 0.0 = deterministic, higher = more creative
# For HTR, 0.1 works well (consistent output without getting stuck in repetition loops)
SAMPLING_TEMPERATURE=0.1

# Max tokens to generate per request
SAMPLING_MAX_TOKENS=500

# Top-p sampling (0.0-1.0) - use top 95% of probability mass
SAMPLING_TOP_P=0.95

# Top-k sampling (0 = disabled)
SAMPLING_TOP_K=0

# Presence penalty (0.0 = none, positive = encourage diversity)
SAMPLING_PRESENCE_PENALTY=0.0

# Frequency penalty (0.0 = none, positive = reduce repetition)
SAMPLING_FREQUENCY_PENALTY=0.0
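
# Sketch of how the sampling values above would appear in an OpenAI-compatible
# chat-completions request body (the FastAPI code that does this mapping is not
# shown here; "top_k" is a vLLM-specific extension to the OpenAI schema):
#
#   {
#     "temperature": 0.1,
#     "max_tokens": 500,
#     "top_p": 0.95,
#     "top_k": 0,
#     "presence_penalty": 0.0,
#     "frequency_penalty": 0.0
#   }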

# --- CRITICAL: Stop Sequences ---
# This is why the 2B model strips newlines under vLLM but not in LM Studio!
# The model's config.json may define default stop tokens like ["\n", "\n\n"]
# Leaving this empty overrides the model defaults and preserves ALL newlines
SAMPLING_STOP_SEQUENCES=

# Alternative: use only the model's special tokens (if needed)
# SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>
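
# How this variable is expected to be parsed (illustrative only; the parsing code
# is not part of this file):
#
#   SAMPLING_STOP_SEQUENCES=                          -> stop = []  (no stop strings, newlines kept)
#   SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>  -> stop = ["<|endoftext|>", "<|im_end|>"]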

# --- FastAPI Settings ---
VLLM_ENDPOINT=http://127.0.0.1:8001/v1/chat/completions
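
# Quick smoke test against the endpoint above (assumes the vLLM server is already
# running on this machine; the payload follows the OpenAI-compatible chat schema):
#
#   curl -s "$VLLM_ENDPOINT" \
#     -H "Content-Type: application/json" \
#     -d '{"model": "Qwen3-VL-8B-AWQ4", "messages": [{"role": "user", "content": "ping"}], "max_tokens": 8}'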

# --- Database ---
DATABASE_PATH=/home/fenix/htr-api/htr_usage.db
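
# The usage database is assumed to be SQLite (going by the .db file); a quick way
# to inspect it from the shell:
#
#   sqlite3 "$DATABASE_PATH" '.tables'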

# --- Limits ---
MAX_IMAGE_SIZE=10485760
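
# 10485760 = 10 * 1024 * 1024, i.e. a 10 MiB cap (the unit is assumed to be bytes).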

# --- OpenRouter (for future cloud fallback) ---
# OPENROUTER_API_KEY=your_key_here
# OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1/chat/completions
# OPENROUTER_MODEL=qwen/qwen3-vl-8b-instruct

# ===========================================
# Model-Specific Presets (Copy to active settings above)
# ===========================================

# --- Qwen3-VL-2B-FP8 (Fastest: 161 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-FP8
# VLLM_MODEL_NAME=Qwen3-VL-2B-FP8
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 3.5GB | Concurrent: ~35-40 users with fp8 cache

# --- Qwen3-VL-2B-BF16 (Fast: 121.8 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-BF16
# VLLM_MODEL_NAME=Qwen3-VL-2B-BF16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 4.3GB | Concurrent: ~35-40 users with fp8 cache

# --- Qwen3-VL-8B-AWQ4 (Balanced: 88 tok/s, 99.17% accuracy) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-awq4
# VLLM_MODEL_NAME=qwen3-vl-8b-awq4
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 7.5GB | Concurrent: ~25-30 users with fp8 cache

# --- Qwen3-VL-8B-AWQ8 (Good: 61 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ-8bit
# VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ-8bit
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 10.8GB | Concurrent: ~20-25 users with fp8 cache

# --- Qwen3-VL-8B-BF16 (Highest quality: 42.1 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-bf16
# VLLM_MODEL_NAME=qwen3-vl-8b-bf16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 17.5GB | Concurrent: ~15-20 users with fp8 cache

# ===========================================
# Performance Impact Summary
# ===========================================

# KV_CACHE_DTYPE Impact on Concurrent Capacity (A6000 48GB):
# ┌─────────────┬──────────────┬─────────────┬────────────┐
# │ Model       │ auto (16bit) │ fp8 (8bit)  │ Speedup    │
# ├─────────────┼──────────────┼─────────────┼────────────┤
# │ 2B-FP8      │ ~18-20 users │ ~35-40 users│ 2.0x       │
# │ 2B-BF16     │ ~16-18 users │ ~35-40 users│ 2.2x       │
# │ 8B-AWQ4     │ ~14-16 users │ ~25-30 users│ 1.9x       │
# │ 8B-AWQ8     │ ~12-14 users │ ~20-25 users│ 1.8x       │
# │ 8B-BF16     │ ~8-10 users  │ ~15-20 users│ 2.0x       │
# └─────────────┴──────────────┴─────────────┴────────────┘

# Accuracy impact: <0.1% drop with fp8 cache (negligible)
# Recommendation: ALWAYS use fp8 for production
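
# Why fp8 roughly doubles capacity (back-of-envelope; the per-model numbers are in
# the table above):
#   KV cache per token ≈ 2 (K and V) × num_layers × num_kv_heads × head_dim × bytes_per_value
#   Dropping bytes_per_value from 2 (16-bit) to 1 (fp8) halves KV-cache memory per token,
#   so roughly twice as many concurrent sequences fit in the same VRAM budget.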

# Temperature Impact:
# 0.0 = Fully deterministic (can cause loops)
# 0.1 = Near-deterministic, robust (RECOMMENDED for HTR)
# 0.3 = Slight variation
# 0.7+ = Too creative for transcription

# Stop Sequences Impact:
# WITH newline stops:    97.10% char accuracy (paragraphs merged)
# WITHOUT newline stops: 99.17% char accuracy (preserves formatting)