Initial PoC commit
Server/.env (new file, 135 lines)
@@ -0,0 +1,135 @@
# ===========================================
# HTR API Configuration - Enhanced Version
# ===========================================

# --- Model Selection ---
# Change these when switching models:
VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ4
VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ4

# --- vLLM Server Settings ---
VLLM_HOST=0.0.0.0
VLLM_PORT=8001
VLLM_GPU_UTIL=0.90

# Model-specific max context length
VLLM_MAX_MODEL_LEN=2560

# --- PERFORMANCE BOOST: KV Cache Quantization ---
# This is the #1 performance optimization!
# Options: auto (default), fp8, fp8_e4m3, fp8_e5m2, int8
# fp8 = 2x more concurrent users, minimal quality loss (<0.1%)
VLLM_KV_CACHE_DTYPE=fp8

# --- Sampling Parameters (Override model defaults!) ---
# CRITICAL: These override the model's config.json defaults.
# Without them, vLLM falls back to model-specific defaults, which vary.

# Temperature: 0.0 = deterministic, higher = more creative
# For HTR, 0.1 works well (consistent without getting stuck in loops)
SAMPLING_TEMPERATURE=0.1

# Max tokens to generate per request
SAMPLING_MAX_TOKENS=500

# Top-p sampling (0.0-1.0) - use top 95% of probability mass
SAMPLING_TOP_P=0.95

# Top-k sampling (0 = disabled)
SAMPLING_TOP_K=0

# Presence penalty (0.0 = none, positive = encourage diversity)
SAMPLING_PRESENCE_PENALTY=0.0

# Frequency penalty (0.0 = none, positive = reduce repetition)
SAMPLING_FREQUENCY_PENALTY=0.0

# --- CRITICAL: Stop Sequences ---
# This is why the 2B model strips newlines under vLLM but not under LM Studio:
# the model's config.json may define default stop tokens like ["\n", "\n\n"].
# Empty = override the model defaults and preserve ALL newlines.
SAMPLING_STOP_SEQUENCES=

# Alternative: use only the model's special tokens (if needed)
# SAMPLING_STOP_SEQUENCES=<|endoftext|>,<|im_end|>

# --- FastAPI Settings ---
VLLM_ENDPOINT=http://127.0.0.1:8001/v1/chat/completions

# --- Database ---
DATABASE_PATH=/home/fenix/htr-api/htr_usage.db

# --- Limits ---
MAX_IMAGE_SIZE=10485760

# --- OpenRouter (for future cloud fallback) ---
# OPENROUTER_API_KEY=your_key_here
# OPENROUTER_ENDPOINT=https://openrouter.ai/api/v1/chat/completions
# OPENROUTER_MODEL=qwen/qwen3-vl-8b-instruct

# ===========================================
# Model-Specific Presets (Copy to active settings above)
# ===========================================

# --- Qwen3-VL-2B-FP8 (Fastest: 161 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-FP8
# VLLM_MODEL_NAME=Qwen3-VL-2B-FP8
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 3.5GB | Concurrent: ~35-40 users with fp8 cache

# --- Qwen3-VL-2B-BF16 (Fast: 121.8 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-2B-BF16
# VLLM_MODEL_NAME=Qwen3-VL-2B-BF16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 4.3GB | Concurrent: ~35-40 users with fp8 cache

# --- Qwen3-VL-8B-AWQ4 (Balanced: 88 tok/s, 99.17% accuracy) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-awq4
# VLLM_MODEL_NAME=qwen3-vl-8b-awq4
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 7.5GB | Concurrent: ~25-30 users with fp8 cache

# --- Qwen3-VL-8B-AWQ8 (Good: 61 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/Qwen3-VL-8B-AWQ-8bit
# VLLM_MODEL_NAME=Qwen3-VL-8B-AWQ-8bit
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 10.8GB | Concurrent: ~20-25 users with fp8 cache

# --- Qwen3-VL-8B-BF16 (Highest quality: 42.1 tok/s) ---
# VLLM_MODEL_PATH=/llm/models/qwen3-vl-8b-bf16
# VLLM_MODEL_NAME=qwen3-vl-8b-bf16
# VLLM_MAX_MODEL_LEN=8192
# VLLM_KV_CACHE_DTYPE=fp8
# Model size: 17.5GB | Concurrent: ~15-20 users with fp8 cache

# ===========================================
# Performance Impact Summary
# ===========================================

# KV_CACHE_DTYPE Impact on Concurrent Capacity (A6000 48GB):
# ┌─────────────┬──────────────┬─────────────┬────────────┐
# │ Model       │ auto (16bit) │ fp8 (8bit)  │ Speedup    │
# ├─────────────┼──────────────┼─────────────┼────────────┤
# │ 2B-FP8      │ ~18-20 users │ ~35-40 users│ 2.0x       │
# │ 2B-BF16     │ ~16-18 users │ ~35-40 users│ 2.2x       │
# │ 8B-AWQ4     │ ~14-16 users │ ~25-30 users│ 1.9x       │
# │ 8B-AWQ8     │ ~12-14 users │ ~20-25 users│ 1.8x       │
# │ 8B-BF16     │ ~8-10 users  │ ~15-20 users│ 2.0x       │
# └─────────────┴──────────────┴─────────────┴────────────┘

# Accuracy impact: <0.1% drop with fp8 cache (negligible)
# Recommendation: ALWAYS use fp8 for production

# Temperature Impact:
# 0.0  = Fully deterministic (can cause loops)
# 0.1  = Near-deterministic, robust (RECOMMENDED for HTR)
# 0.3  = Slight variation
# 0.7+ = Too creative for transcription

# Stop Sequences Impact:
# WITH newline stops:    97.10% char accuracy (paragraphs merged)
# WITHOUT newline stops: 99.17% char accuracy (preserves formatting)
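Note that main.py in this commit still hardcodes temperature and max_tokens, so the SAMPLING_* values above are not yet consumed anywhere. A minimal sketch (not part of the commit) of how they could be forwarded into the OpenAI-compatible request body; build_sampling_params is a hypothetical helper name:

import os

def build_sampling_params() -> dict:
    """Hypothetical helper: map SAMPLING_* env vars onto
    OpenAI-compatible chat-completions request fields."""
    params = {
        "temperature": float(os.getenv("SAMPLING_TEMPERATURE", "0.1")),
        "max_tokens": int(os.getenv("SAMPLING_MAX_TOKENS", "500")),
        "top_p": float(os.getenv("SAMPLING_TOP_P", "0.95")),
        "presence_penalty": float(os.getenv("SAMPLING_PRESENCE_PENALTY", "0.0")),
        "frequency_penalty": float(os.getenv("SAMPLING_FREQUENCY_PENALTY", "0.0")),
    }
    # An empty SAMPLING_STOP_SEQUENCES means "no stop strings": omit the
    # field entirely so newlines are preserved; otherwise split on commas.
    stops = os.getenv("SAMPLING_STOP_SEQUENCES", "")
    if stops:
        params["stop"] = stops.split(",")
    return params

The returned dict can be merged into the JSON payload sent to VLLM_ENDPOINT, keeping the .env file the single source of truth for sampling behavior.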
Server/config.py (new file, 36 lines)
@@ -0,0 +1,36 @@
"""
Configuration settings for HTR API on Phoenix
"""

from pydantic_settings import BaseSettings
from typing import Optional


class Settings(BaseSettings):
    # Signing secret for JWTs; override via JWT_SECRET in .env
    jwt_secret: str = "73936f5c69eb84f013a531a35ffae040855cd6c7891ed1bb0872780fe8c56274"
    jwt_algorithm: str = "HS256"

    # vLLM Configuration - matches the local setup
    vllm_endpoint: str = "http://127.0.0.1:8001/v1/chat/completions"
    vllm_model: str = "qwen3-vl"  # --served-model-name from vLLM

    # LLM Configuration (OpenRouter or local vLLM)
    llm_endpoint: str = "http://127.0.0.1:8001/v1/chat/completions"
    llm_model: str = "qwen3-vl"
    openrouter_api_key: Optional[str] = None

    # Database
    database_path: str = "/home/fenix/htr-api/htr_usage.db"

    # Limits
    max_image_size: int = 10 * 1024 * 1024

    # WordPress
    upgrade_url: str = "https://prometheuscafe.com/plans"

    class Config:
        env_file = ".env"
        env_file_encoding = "utf-8"


settings = Settings()
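For reference, pydantic-settings resolves each field case-insensitively from the process environment and the .env file before falling back to the class default, so the DATABASE_PATH line in .env populates settings.database_path. A small sketch of that precedence:

import os
from config import Settings

# Simulate an override; values from the environment (including .env)
# win over the class defaults declared on Settings.
os.environ["MAX_IMAGE_SIZE"] = str(5 * 1024 * 1024)

s = Settings()                     # instantiated after the override
assert s.max_image_size == 5 * 1024 * 1024
assert s.jwt_algorithm == "HS256"  # no env var set, so the default wins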
Server/database.py (new file, 129 lines)
@@ -0,0 +1,129 @@
"""
SQLite database for tracking daily usage
"""

import sqlite3
from datetime import datetime, timezone, timedelta
from config import settings
import threading

_local = threading.local()


def get_db():
    """Get thread-local database connection"""
    if not hasattr(_local, "connection"):
        _local.connection = sqlite3.connect(
            settings.database_path,
            check_same_thread=False
        )
        _local.connection.row_factory = sqlite3.Row
    return _local.connection


def init_db():
    """Initialize database schema"""
    conn = get_db()
    cursor = conn.cursor()

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS usage (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id TEXT NOT NULL,
            date TEXT NOT NULL,
            count INTEGER DEFAULT 0,
            created_at TEXT DEFAULT CURRENT_TIMESTAMP,
            updated_at TEXT DEFAULT CURRENT_TIMESTAMP,
            UNIQUE(user_id, date)
        )
    """)

    cursor.execute("""
        CREATE INDEX IF NOT EXISTS idx_usage_user_date
        ON usage(user_id, date)
    """)

    cursor.execute("""
        CREATE TABLE IF NOT EXISTS extraction_log (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            user_id TEXT NOT NULL,
            tier TEXT,
            timestamp TEXT DEFAULT CURRENT_TIMESTAMP,
            success INTEGER DEFAULT 1,
            processing_time_ms INTEGER
        )
    """)

    conn.commit()
    cleanup_old_records()


def get_today_utc() -> str:
    """Get today's date in UTC as a string"""
    return datetime.now(timezone.utc).strftime("%Y-%m-%d")


def get_usage_today(user_id: str) -> int:
    """Get the number of extractions used today for a user"""
    conn = get_db()
    cursor = conn.cursor()

    today = get_today_utc()
    cursor.execute(
        "SELECT count FROM usage WHERE user_id = ? AND date = ?",
        (user_id, today)
    )

    row = cursor.fetchone()
    return row["count"] if row else 0


def increment_usage(user_id: str) -> int:
    """Increment usage count for today. Returns the new count."""
    conn = get_db()
    cursor = conn.cursor()

    today = get_today_utc()
    now = datetime.now(timezone.utc).isoformat()

    cursor.execute("""
        INSERT INTO usage (user_id, date, count, updated_at)
        VALUES (?, ?, 1, ?)
        ON CONFLICT(user_id, date)
        DO UPDATE SET
            count = count + 1,
            updated_at = ?
    """, (user_id, today, now, now))

    conn.commit()
    return get_usage_today(user_id)


def cleanup_old_records(days_to_keep: int = 30):
    """Remove usage records older than the specified number of days"""
    conn = get_db()
    cursor = conn.cursor()

    cutoff = (datetime.now(timezone.utc) - timedelta(days=days_to_keep)).strftime("%Y-%m-%d")

    cursor.execute("DELETE FROM usage WHERE date < ?", (cutoff,))
    cursor.execute("DELETE FROM extraction_log WHERE date(timestamp) < ?", (cutoff,))

    conn.commit()


def get_usage_stats(user_id: str, days: int = 7) -> list:
    """Get usage history for a user"""
    conn = get_db()
    cursor = conn.cursor()

    cutoff = (datetime.now(timezone.utc) - timedelta(days=days)).strftime("%Y-%m-%d")

    cursor.execute("""
        SELECT date, count
        FROM usage
        WHERE user_id = ? AND date >= ?
        ORDER BY date DESC
    """, (user_id, cutoff))

    return [dict(row) for row in cursor.fetchall()]
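A short usage sketch of the quota helpers; the DAILY_LIMIT constant is hypothetical and not defined anywhere in this commit:

from database import init_db, get_usage_today, increment_usage

DAILY_LIMIT = 10  # hypothetical quota; no such constant exists in this commit

init_db()  # creates tables and prunes records older than 30 days

user = "wp-user-42"
if get_usage_today(user) >= DAILY_LIMIT:
    print("quota exhausted; direct the user to settings.upgrade_url")
else:
    new_count = increment_usage(user)  # atomic upsert on (user_id, date)
    print(f"{new_count}/{DAILY_LIMIT} extractions used today")

The ON CONFLICT upsert in increment_usage makes the read-modify-write a single statement, so concurrent requests for the same user cannot lose counts.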
Server/htr-api.service (new file, 29 lines)
@@ -0,0 +1,29 @@
# ===========================================
# HTR FastAPI Service
# Reads configuration from /home/fenix/htr-api/.env
# ===========================================
# BACKUP COPY - After editing, run:
#   sudo cp ~/htr-api/htr-api.service /etc/systemd/system/
#   sudo systemctl daemon-reload && sudo systemctl restart htr-api
# ===========================================

[Unit]
Description=HTR FastAPI Service
After=network.target vllm-htr.service
Wants=vllm-htr.service

[Service]
Type=simple
User=fenix
WorkingDirectory=/home/fenix/htr-api

# Load environment variables from .env
EnvironmentFile=/home/fenix/htr-api/.env

ExecStart=/home/fenix/htr-api/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000

Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
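After installing and restarting the unit, the /api/health endpoint defined in main.py (served on port 8000 per ExecStart above) gives a quick liveness probe; a minimal check:

import httpx

# Probe the /api/health endpoint exposed by main.py (port 8000 per the unit file).
resp = httpx.get("http://127.0.0.1:8000/api/health", timeout=5.0)
resp.raise_for_status()
info = resp.json()
print(info["status"], info["model"])  # e.g. "healthy Qwen3-VL-8B-AWQ4"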
Server/main.py (new file, 139 lines)
@@ -0,0 +1,139 @@
"""
HTR API - Simplified Local vLLM Version
Reads configuration from environment variables (.env file)
"""

import os
import base64

import httpx
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel

# ===========================================
# Configuration from Environment
# ===========================================
VLLM_ENDPOINT = os.getenv("VLLM_ENDPOINT", "http://127.0.0.1:8001/v1/chat/completions")
VLLM_MODEL_NAME = os.getenv("VLLM_MODEL_NAME")
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", 10 * 1024 * 1024))

# Fail fast if critical env vars aren't set
if not VLLM_MODEL_NAME:
    raise RuntimeError("VLLM_MODEL_NAME not set! Check .env file and htr-api.service EnvironmentFile path.")

# ===========================================
# HTR Prompt
# ===========================================
HTR_PROMPT = """You are a handwriting transcription assistant.

Your task is to transcribe the handwritten pages into plain text paragraphs, preserving the original wording exactly.

Rules:
- Do not change, correct, or improve any spelling, grammar, punctuation, or sentence structure.
- Preserve paragraph breaks as they appear in the handwriting. Start a new paragraph only where the handwriting clearly starts one, marked by a blank line. Do NOT insert a line break after every sentence just because it wraps at the edge of the journal image; break only at the clear blank lines between paragraphs.
- Insert two line breaks (a blank line) to clearly indicate each new paragraph.
- If you are unsure of a word or cannot read it, write ??? in its place.
- Do not add, remove, or rearrange any sentences.

Only output the transcribed text following these rules."""

# ===========================================
# FastAPI App
# ===========================================
app = FastAPI(
    title="HTR API",
    description="Handwriting Text Recognition API for Prometheus Cafe",
    version="2.0.0"
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ===========================================
# Request/Response Models
# ===========================================
class ExtractRequest(BaseModel):
    image: str  # Base64 encoded image

class ExtractResponse(BaseModel):
    text: str

# ===========================================
# Endpoints
# ===========================================
@app.get("/api/health")
async def health_check():
    """Health check endpoint"""
    return {
        "status": "healthy",
        "backend": "local-vllm",
        "model": VLLM_MODEL_NAME,
        "endpoint": VLLM_ENDPOINT
    }

@app.post("/api/extract", response_model=ExtractResponse)
async def extract_text(request: ExtractRequest):
    """Extract text from handwritten image using local vLLM."""

    # Validate and clean image data
    try:
        image_data = request.image
        if "," in image_data:
            # Strip a data-URL prefix like "data:image/jpeg;base64,"
            image_data = image_data.split(",")[1]

        decoded = base64.b64decode(image_data)
        if len(decoded) > MAX_IMAGE_SIZE:
            raise HTTPException(
                status_code=400,
                detail=f"Image too large. Max size: {MAX_IMAGE_SIZE // 1024 // 1024}MB"
            )
    except HTTPException:
        # Re-raise the size-limit error as-is instead of wrapping it below
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image data: {str(e)}")

    # Call local vLLM
    try:
        async with httpx.AsyncClient(timeout=180.0) as client:
            response = await client.post(
                VLLM_ENDPOINT,
                headers={"Content-Type": "application/json"},
                json={
                    "model": VLLM_MODEL_NAME,
                    "messages": [
                        {
                            "role": "user",
                            "content": [
                                {"type": "text", "text": HTR_PROMPT},
                                {
                                    "type": "image_url",
                                    "image_url": {
                                        "url": f"data:image/jpeg;base64,{image_data}"
                                    }
                                }
                            ]
                        }
                    ],
                    "max_tokens": 2048,
                    "temperature": 0.1,
                }
            )

            if response.status_code != 200:
                raise HTTPException(
                    status_code=500,
                    detail=f"vLLM error {response.status_code}: {response.text}"
                )

            result = response.json()
            extracted_text = result["choices"][0]["message"]["content"]

            return ExtractResponse(text=extracted_text)

    except httpx.TimeoutException:
        raise HTTPException(status_code=504, detail="vLLM request timed out")
    except HTTPException:
        # Propagate the vLLM error detail instead of re-wrapping it as 500
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}")
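For completeness, a client-side sketch of the /api/extract contract; the image path is illustrative, and either a bare base64 string or a full data URL works because the handler strips the prefix:

import base64
import httpx

# Encode a scanned journal page; "page1.jpg" is just an example path.
with open("page1.jpg", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("ascii")

resp = httpx.post(
    "http://127.0.0.1:8000/api/extract",
    json={"image": image_b64},  # a "data:image/jpeg;base64,..." URL also works
    timeout=180.0,              # transcription of a full page can take a while
)
resp.raise_for_status()
print(resp.json()["text"])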
Server/vllm-htr.service (new file, 36 lines)
@@ -0,0 +1,36 @@
# ===========================================
# vLLM HTR Service
# Reads configuration from /home/fenix/htr-api/.env
# ===========================================
# BACKUP COPY - After editing, run:
#   sudo cp ~/htr-api/vllm-htr.service /etc/systemd/system/
#   sudo systemctl daemon-reload && sudo systemctl restart vllm-htr
# ===========================================

[Unit]
Description=vLLM Server for HTR
After=network.target

[Service]
Type=simple
User=fenix
WorkingDirectory=/llm

# Load environment variables from .env
EnvironmentFile=/home/fenix/htr-api/.env

# systemd expands ${VAR} in ExecStart from the EnvironmentFile above
ExecStart=/llm/env/bin/vllm serve ${VLLM_MODEL_PATH} \
    --host ${VLLM_HOST} \
    --port ${VLLM_PORT} \
    --max-model-len ${VLLM_MAX_MODEL_LEN} \
    --gpu-memory-utilization ${VLLM_GPU_UTIL} \
    --kv-cache-dtype ${VLLM_KV_CACHE_DTYPE} \
    --trust-remote-code \
    --served-model-name ${VLLM_MODEL_NAME}

Restart=on-failure
RestartSec=10
TimeoutStartSec=300

[Install]
WantedBy=multi-user.target
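Once the unit is up, vLLM's OpenAI-compatible /v1/models endpoint can confirm the model is being served under the expected VLLM_MODEL_NAME; a quick check against port 8001 as configured in .env:

import httpx

# List models from the OpenAI-compatible endpoint (port 8001 per .env).
resp = httpx.get("http://127.0.0.1:8001/v1/models", timeout=10.0)
resp.raise_for_status()
served = [m["id"] for m in resp.json()["data"]]
print(served)  # expect ["Qwen3-VL-8B-AWQ4"] to match VLLM_MODEL_NAME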