# pen2post/Server/.env
# ===========================================
# HTR API Configuration - Enhanced Version
# ===========================================
# --- Model Selection ---
# Change these when switching models:
VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ4
VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ4
# --- vLLM Server Settings ---
VLLM_HOST=0.0.0.0
VLLM_PORT=8001
VLLM_GPU_UTIL=0.90
# Model-specific max context length
VLLM_MAX_MODEL_LEN=2560
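# Example launch (a sketch, assuming the standard `vllm serve` CLI and that
# this file has been exported first, e.g. `set -a; source .env; set +a`;
# VLLM_KV_CACHE_DTYPE is set just below):
# vllm serve "$VLLM_MODEL_PATH" \
#   --served-model-name "$VLLM_MODEL_NAME" \
#   --host "$VLLM_HOST" \
#   --port "$VLLM_PORT" \
#   --gpu-memory-utilization "$VLLM_GPU_UTIL" \
#   --max-model-len "$VLLM_MAX_MODEL_LEN" \
#   --kv-cache-dtype "$VLLM_KV_CACHE_DTYPE"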
# --- PERFORMANCE BOOST: KV Cache Quantization ---
# This is the #1 performance optimization!
# Options: auto (default), fp8, fp8_e4m3, fp8_e5m2
# fp8 ≈ 2x concurrent user capacity with minimal quality loss (<0.1%)
VLLM_KV_CACHE_DTYPE=fp8
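# Back-of-envelope for the 2x claim (illustrative): KV bytes per token ≈
# 2 (K and V) * num_layers * num_kv_heads * head_dim * bytes_per_value.
# Dropping from 16-bit (2 bytes) to fp8 (1 byte) halves the cache footprint,
# so roughly twice as many concurrent sequences fit in the same VRAM.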
# --- Sampling Parameters (Override model defaults!) ---
# CRITICAL: These override the defaults in the model's generation_config.json.
# Without them, vLLM falls back to model-specific defaults, which vary by model.
# Temperature: 0.0 = deterministic, higher = more creative
# For HTR: 0.1 works well (consistent output without getting stuck in loops)
SAMPLING_TEMPERATURE=0.1
# Max tokens to generate per request
SAMPLING_MAX_TOKENS=500
# Top-p (nucleus) sampling (0.0-1.0) - sample from the top 95% of probability mass
SAMPLING_TOP_P=0.95
# Top-k sampling (0 = disabled)
SAMPLING_TOP_K=0
# Presence penalty (0.0 = none, positive = encourage diversity)
SAMPLING_PRESENCE_PENALTY=0.0
# Frequency penalty (0.0 = none, positive = reduce repetition)
SAMPLING_FREQUENCY_PENALTY=0.0
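# Example request carrying these overrides (a sketch; the prompt text is a
# placeholder and the fields are standard OpenAI-compatible parameters):
# curl -s "$VLLM_ENDPOINT" -H 'Content-Type: application/json' -d '{
#   "model": "'"$VLLM_MODEL_NAME"'",
#   "messages": [{"role": "user", "content": "Transcribe this page."}],
#   "temperature": 0.1, "max_tokens": 500, "top_p": 0.95,
#   "presence_penalty": 0.0, "frequency_penalty": 0.0
# }'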
# --- CRITICAL: Stop Sequences ---
# This is why the 2B model strips newlines in vLLM but not in LM Studio!
# The model's generation_config.json may define default stop tokens like ["\n", "\n\n"]
# Empty = Override model defaults, preserve ALL newlines
SAMPLING_STOP_SEQUENCES=
# Alternative: Use only model's special tokens (if needed)
# SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>
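# Parsing sketch (assumption: the FastAPI layer splits this comma-separated
# value into the request's "stop" array; equivalent bash shown here):
# IFS=',' read -r -a STOPS <<< "$SAMPLING_STOP_SEQUENCES"
# echo "${#STOPS[@]} stop sequence(s)"   # 0 when the variable is empty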
# --- FastAPI Settings ---
VLLM_ENDPOINT=http://127.0.0.1:8001/v1/chat/completions
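# Quick connectivity check (/v1/models is a standard route on vLLM's
# OpenAI-compatible server):
# curl -s http://127.0.0.1:8001/v1/models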
# --- Database ---
DATABASE_PATH=/home/fenix/htr-api/htr_usage.db
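# Inspect usage data locally (standard sqlite3 CLI; table names depend on
# the API's schema):
# sqlite3 "$DATABASE_PATH" '.tables'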
# --- Limits ---
MAX_IMAGE_SIZE=10485760
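# 10485760 bytes = 10 * 1024 * 1024 = 10 MiB per uploaded image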
# --- OpenRouter (for future cloud fallback) ---
# OPENROUTER_API_KEY=your_key_here
# OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1/chat/completions
# OPENROUTER_MODEL=qwen/qwen3-vl-8b-instruct
# ===========================================
# Model-Specific Presets (Copy to active settings above)
# ===========================================
# --- Qwen3-VL-2B-FP8 (Fastest: 161 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-FP8
# VLLM_MODEL_NAME=Qwen3-VL-2B-FP8
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 3.5GB | Concurrent: ~35-40 users with fp8 cache
# --- Qwen3-VL-2B-BF16 (Fast: 121.8 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-BF16
# VLLM_MODEL_NAME=Qwen3-VL-2B-BF16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 4.3GB | Concurrent: ~35-40 users with fp8 cache
# --- Qwen3-VL-8B-AWQ4 (Balanced: 88 tok/s, 99.17% accuracy) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-awq4
# VLLM_MODEL_NAME=qwen3-vl-8b-awq4
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 7.5GB | Concurrent: ~25-30 users with fp8 cache
# --- Qwen3-VL-8B-AWQ8 (Good: 61 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ-8bit
# VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ-8bit
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 10.8GB | Concurrent: ~20-25 users with fp8 cache
# --- Qwen3-VL-8B-BF16 (Highest quality: 42.1 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-bf16
# VLLM_MODEL_NAME=qwen3-vl-8b-bf16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 17.5GB | Concurrent: ~15-20 users with fp8 cache
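# After copying a preset above and restarting vLLM (restart mechanism depends
# on your deployment), confirm which model is being served:
# curl -s http://127.0.0.1:8001/v1/models | grep -o '"id":[^,]*'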
# ===========================================
# Performance Impact Summary
# ===========================================
# KV_CACHE_DTYPE Impact on Concurrent Capacity (A6000 48GB):
# ┌─────────┬──────────────┬──────────────┬─────────┐
# │ Model   │ auto (16bit) │ fp8 (8bit)   │ Speedup │
# ├─────────┼──────────────┼──────────────┼─────────┤
# │ 2B-FP8  │ ~18-20 users │ ~35-40 users │ 2.0x    │
# │ 2B-BF16 │ ~16-18 users │ ~35-40 users │ 2.2x    │
# │ 8B-AWQ4 │ ~14-16 users │ ~25-30 users │ 1.9x    │
# │ 8B-AWQ8 │ ~12-14 users │ ~20-25 users │ 1.8x    │
# │ 8B-BF16 │ ~8-10 users  │ ~15-20 users │ 2.0x    │
# └─────────┴──────────────┴──────────────┴─────────┘
# Accuracy impact: <0.1% drop with fp8 cache (negligible)
# Recommendation: ALWAYS use fp8 for production
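# Worked example from the numbers above (illustrative): at VLLM_GPU_UTIL=0.90
# an A6000 budgets ~43.2GB; with 8B-AWQ4 weights (7.5GB) that leaves ~35GB
# for KV cache, and halving the per-token cache cost with fp8 is what turns
# ~14-16 users into ~25-30.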
# Temperature Impact:
# 0.0 = Fully deterministic (can cause loops)
# 0.1 = Near-deterministic, robust (RECOMMENDED for HTR)
# 0.3 = Slight variation
# 0.7+ = Too creative for transcription
# Stop Sequences Impact:
# WITH newline stops: 97.10% char accuracy (paragraphs merged)
# WITHOUT newline stops: 99.17% char accuracy (preserves formatting)