Initial PoC commit

Fenix
2026-01-28 20:39:59 -08:00
commit 94811ca7c1
18 changed files with 1831 additions and 0 deletions

Server/.env Normal file

@@ -0,0 +1,135 @@
# ===========================================
# HTR API Configuration - Enhanced Version
# ===========================================
# --- Model Selection ---
# Change these when switching models:
VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ4
VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ4
# --- vLLM Server Settings ---
VLLM_HOST=0.0.0.0
VLLM_PORT=8001
VLLM_GPU_UTIL=0.90
# Model-specific max context length
VLLM_MAX_MODEL_LEN=2560
# --- PERFORMANCE BOOST: KV Cache Quantization ---
# This is the #1 performance optimization!
# Options: auto (default), fp8, fp8_e4m3, fp8_e5m2, int8
# fp8 = 2x more concurrent users, minimal quality loss (<0.1%)
VLLM_KV_CACHE_DTYPE=fp8
# --- Sampling Parameters (Override model defaults!) ---
# CRITICAL: These override model config.json defaults
# Without these, vLLM uses model-specific defaults which may vary
# Temperature: 0.0 = deterministic, higher = more creative
# For HTR: 0.1 is perfect (consistent but not stuck in loops)
SAMPLING_TEMPERATURE=0.1
# Max tokens to generate per request
SAMPLING_MAX_TOKENS=500
# Top-p sampling (0.0-1.0) - use top 95% of probability mass
SAMPLING_TOP_P=0.95
# Top-k sampling (0 = disabled)
SAMPLING_TOP_K=0
# Presence penalty (0.0 = none, positive = encourage diversity)
SAMPLING_PRESENCE_PENALTY=0.0
# Frequency penalty (0.0 = none, positive = reduce repetition)
SAMPLING_FREQUENCY_PENALTY=0.0
# --- CRITICAL: Stop Sequences ---
# This is why the 2B model strips newlines in vLLM but not in LM Studio!
# The model's config.json may define default stop tokens like ["\n", "\n\n"]
# Empty = override the model defaults and preserve ALL newlines
SAMPLING_STOP_SEQUENCES=
# Alternative: Use only model's special tokens (if needed)
# SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>
# --- FastAPI Settings ---
VLLM_ENDPOINT=http://127.0.0.1:8001/v1/chat/completions
# --- Database ---
DATABASE_PATH=/home/fenix/htr-api/htr_usage.db
# --- Limits ---
MAX_IMAGE_SIZE=10485760
# --- OpenRouter (for future cloud fallback) ---
# OPENROUTER_API_KEY=your_key_here
# OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1/chat/completions
# OPENROUTER_MODEL=qwen/qwen3-vl-8b-instruct
# ===========================================
# Model-Specific Presets (Copy to active settings above)
# ===========================================
# --- Qwen3-VL-2B-FP8 (Fastest: 161 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-FP8
# VLLM_MODEL_NAME=Qwen3-VL-2B-FP8
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 3.5GB | Concurrent: ~35-40 users with fp8 cache
# --- Qwen3-VL-2B-BF16 (Fast: 121.8 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-BF16
# VLLM_MODEL_NAME=Qwen3-VL-2B-BF16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 4.3GB | Concurrent: ~35-40 users with fp8 cache
# --- Qwen3-VL-8B-AWQ4 (Balanced: 88 tok/s, 99.17% accuracy) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-awq4
# VLLM_MODEL_NAME=qwen3-vl-8b-awq4
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 7.5GB | Concurrent: ~25-30 users with fp8 cache
# --- Qwen3-VL-8B-AWQ8 (Good: 61 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ-8bit
# VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ-8bit
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 10.8GB | Concurrent: ~20-25 users with fp8 cache
# --- Qwen3-VL-8B-BF16 (Highest quality: 42.1 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-bf16
# VLLM_MODEL_NAME=qwen3-vl-8b-bf16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 17.5GB | Concurrent: ~15-20 users with fp8 cache
# ===========================================
# Performance Impact Summary
# ===========================================
# KV_CACHE_DTYPE Impact on Concurrent Capacity (A6000 48GB):
# ┌─────────────┬──────────────┬─────────────┬────────────┐
# │ Model │ auto (16bit) │ fp8 (8bit) │ Speedup │
# ├─────────────┼──────────────┼─────────────┼────────────┤
# │ 2B-FP8 │ ~18-20 users │ ~35-40 users│ 2.0x │
# │ 2B-BF16 │ ~16-18 users │ ~35-40 users│ 2.2x │
# │ 8B-AWQ4 │ ~14-16 users │ ~25-30 users│ 1.9x │
# │ 8B-AWQ8 │ ~12-14 users │ ~20-25 users│ 1.8x │
# │ 8B-BF16 │ ~8-10 users │ ~15-20 users│ 2.0x │
# └─────────────┴──────────────┴─────────────┴────────────┘
# Accuracy impact: <0.1% drop with fp8 cache (negligible)
# Recommendation: ALWAYS use fp8 for production
# Temperature Impact:
# 0.0 = Fully deterministic (can cause loops)
# 0.1 = Near-deterministic, robust (RECOMMENDED for HTR)
# 0.3 = Slight variation
# 0.7+ = Too creative for transcription
# Stop Sequences Impact:
# WITH newline stops: 97.10% char accuracy (paragraphs merged)
# WITHOUT newline stops: 99.17% char accuracy (preserves formatting)
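Note that the SAMPLING_* values above only take effect if the API layer forwards them with each request: vLLM's OpenAI-compatible endpoint applies sampling parameters per request, not at serve time. A minimal sketch of that wiring, assuming the same env var names as this .env (main.py in this commit still hardcodes its own temperature and max_tokens):

import os

def sampling_params_from_env() -> dict:
    """Build per-request sampling overrides from the SAMPLING_* env vars."""
    params = {
        "temperature": float(os.getenv("SAMPLING_TEMPERATURE", "0.1")),
        "max_tokens": int(os.getenv("SAMPLING_MAX_TOKENS", "500")),
        "top_p": float(os.getenv("SAMPLING_TOP_P", "0.95")),
        "presence_penalty": float(os.getenv("SAMPLING_PRESENCE_PENALTY", "0.0")),
        "frequency_penalty": float(os.getenv("SAMPLING_FREQUENCY_PENALTY", "0.0")),
    }
    top_k = int(os.getenv("SAMPLING_TOP_K", "0"))
    if top_k > 0:
        params["top_k"] = top_k  # vLLM accepts top_k as an extra body field; 0 = disabled
    stops = os.getenv("SAMPLING_STOP_SEQUENCES", "")
    # Empty string -> empty stop list, overriding any model-default stop tokens
    params["stop"] = [s for s in stops.split(",") if s]
    return params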

Server/config.py Normal file

@@ -0,0 +1,36 @@
"""
Configuration settings for HTR API on Phoenix
"""
from pydantic_settings import BaseSettings
from typing import Optional
class Settings(BaseSettings):
jwt_secret: str = "JWT_SECRET=73936f5c69eb84f013a531a35ffae040855cd6c7891ed1bb0872780fe8c56274"
jwt_algorithm: str = "HS256"
# vLLM Configuration - matches your local setup
vllm_endpoint: str = "http://127.0.0.1:8001/v1/chat/completions"
vllm_model: str = "qwen3-vl" # served-model-name from vLLM
# LLM Configuration (OpenRouter or local vLLM)
llm_endpoint: str = "http://127.0.0.1:8001/v1/chat/completions"
llm_model: str = "qwen3-vl"
openrouter_api_key: Optional[str] = None
# Database
database_path: str = "/home/fenix/htr-api/htr_usage.db"
# Limits
max_image_size: int = 10 * 1024 * 1024
# WordPress
upgrade_url: str = "https://prometheuscafe.com/plans"
class Config:
env_file = ".env"
env_file_encoding = "utf-8"
settings = Settings()
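Because Settings inherits from BaseSettings, every field can be overridden by a matching environment variable or .env entry, matched case-insensitively (so DATABASE_PATH feeds database_path). A quick check, assuming the .env above sits next to the code:

from config import settings

# Values come from .env when present, otherwise the defaults above
print(settings.vllm_endpoint)   # http://127.0.0.1:8001/v1/chat/completions
print(settings.database_path)   # /home/fenix/htr-api/htr_usage.db
print(settings.max_image_size)  # 10485760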

Server/database.py Normal file

@@ -0,0 +1,129 @@
"""
SQLite database for tracking daily usage
"""
import sqlite3
from datetime import datetime, timezone, timedelta
from config import settings
import threading
_local = threading.local()
def get_db():
"""Get thread-local database connection"""
if not hasattr(_local, "connection"):
_local.connection = sqlite3.connect(
settings.database_path,
check_same_thread=False
)
_local.connection.row_factory = sqlite3.Row
return _local.connection
def init_db():
"""Initialize database schema"""
conn = get_db()
cursor = conn.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS usage (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
date TEXT NOT NULL,
count INTEGER DEFAULT 0,
created_at TEXT DEFAULT CURRENT_TIMESTAMP,
updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
UNIQUE(user_id, date)
)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_usage_user_date
ON usage(user_id, date)
""")
cursor.execute("""
CREATE TABLE IF NOT EXISTS extraction_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id TEXT NOT NULL,
tier TEXT,
timestamp TEXT DEFAULT CURRENT_TIMESTAMP,
success INTEGER DEFAULT 1,
processing_time_ms INTEGER
)
""")
conn.commit()
cleanup_old_records()
def get_today_utc() -> str:
"""Get today's date in UTC as string"""
return datetime.now(timezone.utc).strftime("%Y-%m-%d")
def get_usage_today(user_id: str) -> int:
"""Get number of extractions used today for a user"""
conn = get_db()
cursor = conn.cursor()
today = get_today_utc()
cursor.execute(
"SELECT count FROM usage WHERE user_id = ? AND date = ?",
(user_id, today)
)
row = cursor.fetchone()
return row["count"] if row else 0
def increment_usage(user_id: str) -> int:
"""Increment usage count for today. Returns new count."""
conn = get_db()
cursor = conn.cursor()
today = get_today_utc()
now = datetime.now(timezone.utc).isoformat()
cursor.execute("""
INSERT INTO usage (user_id, date, count, updated_at)
VALUES (?, ?, 1, ?)
ON CONFLICT(user_id, date)
DO UPDATE SET
count = count + 1,
updated_at = ?
""", (user_id, today, now, now))
conn.commit()
return get_usage_today(user_id)
def cleanup_old_records(days_to_keep: int = 30):
"""Remove usage records older than specified days"""
conn = get_db()
cursor = conn.cursor()
cutoff = (datetime.now(timezone.utc) - timedelta(days=days_to_keep)).strftime("%Y-%m-%d")
cursor.execute("DELETE FROM usage WHERE date < ?", (cutoff,))
cursor.execute("DELETE FROM extraction_log WHERE date(timestamp) < ?", (cutoff,))
conn.commit()
def get_usage_stats(user_id: str, days: int = 7) -> list:
"""Get usage history for a user"""
conn = get_db()
cursor = conn.cursor()
cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")
cursor.execute("""
SELECT date, count
FROM usage
WHERE user_id = ? AND date >= ?
ORDER BY date DESC
""", (user_id, cutoff))
return [dict(row) for row in cursor.fetchall()]
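The UNIQUE(user_id, date) constraint plus the ON CONFLICT upsert makes increment_usage a single atomic operation per user per UTC day, which is exactly what a daily-quota check needs. A sketch of how an endpoint might use it; DAILY_LIMIT and the user id are hypothetical, nothing in this commit defines them:

from database import init_db, get_usage_today, increment_usage

DAILY_LIMIT = 10  # hypothetical free-tier quota; not defined in this commit

init_db()
user = "wp-user-42"  # hypothetical WordPress user id
if get_usage_today(user) >= DAILY_LIMIT:
    raise RuntimeError("Daily quota exhausted; upgrade at /plans")
new_count = increment_usage(user)  # upsert: insert 1, or add 1 to today's row
print(f"{user} has used {new_count}/{DAILY_LIMIT} extractions today")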

Server/htr-api.service Normal file

@@ -0,0 +1,29 @@
# ===========================================
# HTR FastAPI Service
# Reads configuration from /home/fenix/htr-api/.env
# ===========================================
# BACKUP COPY - After editing, run:
# sudo cp ~/htr-api/htr-api.service /etc/systemd/system/
# sudo systemctl daemon-reload && sudo systemctl restart htr-api
# ===========================================
[Unit]
Description=HTR FastAPI Service
After=network.target vllm-htr.service
Wants=vllm-htr.service
[Service]
Type=simple
User=fenix
WorkingDirectory=/home/fenix/htr-api
# Load environment variables from .env
EnvironmentFile=/home/fenix/htr-api/.env
ExecStart=/home/fenix/htr-api/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

Server/main.py Normal file

@@ -0,0 +1,139 @@
"""
HTR API - Simplified Local vLLM Version
Reads configuration from environment variables (.env file)
"""
import os
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import httpx
import base64
# ===========================================
# Configuration from Environment
# ===========================================
VLLM_ENDPOINT = os.getenv("VLLM_ENDPOINT", "http://127.0.0.1:8001/v1/chat/completions")
VLLM_MODEL_NAME = os.getenv("VLLM_MODEL_NAME")
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", 10 * 1024 * 1024))
# Fail fast if critical env vars aren't set
if not VLLM_MODEL_NAME:
raise RuntimeError("VLLM_MODEL_NAME not set! Check .env file and htr-api.service EnvironmentFile path.")
# ===========================================
# HTR Prompt
# ===========================================
HTR_PROMPT = """You are a handwriting transcription assistant.
Your task is to transcribe the handwritten pages into plain text paragraphs, preserving the original wording exactly.
Rules:
- Do not change, correct, or improve any spelling, grammar, punctuation, or sentence structure.
- Preserve paragraph breaks as they appear in the handwriting. Start a new paragraph only where the handwriting clearly starts a new paragraph that has a blank line. Do NOT insert line breaks on every sentence because the sentence is not fitting in the journal image. Just the paragraph break with the clear blank line between paragraphs.
- Also, insert 2 paragraphs breaks to clearly indicate a new paragraph.
- If you are unsure of a word or cannot read it, write ??? in its place.
- Do not add, remove, or rearrange any sentences.
Only output the transcribed text following these rules."""
# ===========================================
# FastAPI App
# ===========================================
app = FastAPI(
title="HTR API",
description="Handwriting Text Recognition API for Prometheus Cafe",
version="2.0.0"
)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# ===========================================
# Request/Response Models
# ===========================================
class ExtractRequest(BaseModel):
image: str # Base64 encoded image
class ExtractResponse(BaseModel):
text: str
# ===========================================
# Endpoints
# ===========================================
@app.get("/api/health")
async def health_check():
"""Health check endpoint"""
return {
"status": "healthy",
"backend": "local-vllm",
"model": VLLM_MODEL_NAME,
"endpoint": VLLM_ENDPOINT
}
@app.post("/api/extract", response_model=ExtractResponse)
async def extract_text(request: ExtractRequest):
"""Extract text from handwritten image using local vLLM."""
# Validate and clean image data
try:
image_data = request.image
if "," in image_data:
image_data = image_data.split(",")[1]
decoded = base64.b64decode(image_data)
if len(decoded) > MAX_IMAGE_SIZE:
raise HTTPException(
status_code=400,
detail=f"Image too large. Max size: {MAX_IMAGE_SIZE // 1024 // 1024}MB"
)
except Exception as e:
raise HTTPException(status_code=400, detail=f"Invalid image data: {str(e)}")
# Call local vLLM
try:
async with httpx.AsyncClient(timeout=180.0) as client:
response = await client.post(
VLLM_ENDPOINT,
headers={"Content-Type": "application/json"},
json={
"model": VLLM_MODEL_NAME,
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": HTR_PROMPT},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_data}"
}
}
]
}
],
"max_tokens": 2048,
"temperature": 0.1,
}
)
if response.status_code != 200:
raise HTTPException(
status_code=500,
detail=f"vLLM error {response.status_code}: {response.text}"
)
result = response.json()
extracted_text = result["choices"][0]["message"]["content"]
return ExtractResponse(text=extracted_text)
except httpx.TimeoutException:
raise HTTPException(status_code=504, detail="vLLM request timed out")
except Exception as e:
raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}")
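A minimal client sketch against the two endpoints above, assuming the service is reachable on port 8000 as configured in htr-api.service and that a sample page.jpg exists locally:

import base64
import httpx

BASE = "http://127.0.0.1:8000"  # host/port from htr-api.service

print(httpx.get(f"{BASE}/api/health").json())

# /api/extract accepts a bare base64 string or a full data URL
with open("page.jpg", "rb") as f:  # hypothetical sample image
    image_b64 = base64.b64encode(f.read()).decode()

resp = httpx.post(f"{BASE}/api/extract", json={"image": image_b64}, timeout=200.0)
resp.raise_for_status()
print(resp.json()["text"])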

Server/vllm-htr.service Normal file

@@ -0,0 +1,36 @@
# ===========================================
# vLLM HTR Service
# Reads configuration from /home/fenix/htr-api/.env
# ===========================================
# BACKUP COPY - After editing, run:
# sudo cp ~/htr-api/vllm-htr.service /etc/systemd/system/
# sudo systemctl daemon-reload && sudo systemctl restart vllm-htr
# ===========================================
[Unit]
Description=vLLM Server for HTR
After=network.target
[Service]
Type=simple
User=fenix
WorkingDirectory=/llm
# Load environment variables from .env
EnvironmentFile=/home/fenix/htr-api/.env
# Use environment variables in ExecStart
ExecStart=/llm/env/bin/vllm serve ${VLLM_MODEL_PATH} \
--host ${VLLM_HOST} \
--port ${VLLM_PORT} \
--max-model-len ${VLLM_MAX_MODEL_LEN} \
--gpu-memory-utilization ${VLLM_GPU_UTIL} \
--trust-remote-code \
--served-model-name ${VLLM_MODEL_NAME}
Restart=on-failure
RestartSec=10
TimeoutStartSec=300
[Install]
WantedBy=multi-user.target
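Since htr-api.service only declares Wants=vllm-htr.service, systemd starts both units but does not wait for vLLM to finish loading model weights (which can take minutes, hence TimeoutStartSec=300). A readiness-probe sketch that polls vLLM's OpenAI-compatible model list, assuming the port from the .env above:

import os
import time
import httpx

VLLM_BASE = f"http://127.0.0.1:{os.getenv('VLLM_PORT', '8001')}"

def wait_for_vllm(timeout_s: int = 300) -> bool:
    """Poll the OpenAI-compatible /v1/models route until vLLM answers."""
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        try:
            if httpx.get(f"{VLLM_BASE}/v1/models", timeout=5.0).status_code == 200:
                return True
        except httpx.HTTPError:
            pass  # server still loading the model weights
        time.sleep(5)
    return False

if __name__ == "__main__":
    print("vLLM ready" if wait_for_vllm() else "vLLM did not come up in time")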