# pen2post/Server/main.py

"""
HTR API - Simplified Local vLLM Version
Reads configuration from environment variables (.env file)
"""

import os
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import httpx
import base64

# ===========================================
# Configuration from Environment
# ===========================================
# URL of the vLLM chat-completions endpoint; defaults to a server on localhost.
VLLM_ENDPOINT = os.environ.get(
    "VLLM_ENDPOINT",
    "http://127.0.0.1:8001/v1/chat/completions",
)
# Model identifier sent with every request; there is no default on purpose.
VLLM_MODEL_NAME = os.environ.get("VLLM_MODEL_NAME")
# Upper bound, in bytes, on the decoded image accepted by /api/extract (10 MiB
# unless overridden).
MAX_IMAGE_SIZE = int(os.environ.get("MAX_IMAGE_SIZE", 10 * 1024 * 1024))

# Refuse to start without a model name rather than failing on the first request.
if not VLLM_MODEL_NAME:
    raise RuntimeError("VLLM_MODEL_NAME not set! Check .env file and htr-api.service EnvironmentFile path.")

# ===========================================
# HTR Prompt
# ===========================================
# Instruction text sent to the model as the "text" part of the user message,
# alongside the image. It asks for a verbatim transcription: no corrections,
# paragraph breaks preserved, and "???" for unreadable words.
HTR_PROMPT = """You are a handwriting transcription assistant.

Your task is to transcribe the handwritten pages into plain text paragraphs, preserving the original wording exactly.

Rules:
- Do not change, correct, or improve any spelling, grammar, punctuation, or sentence structure.
- Preserve paragraph breaks as they appear in the handwriting. Start a new paragraph only where the handwriting clearly starts a new paragraph that has a blank line. Do NOT insert line breaks on every sentence because the sentence is not fitting in the journal image. Just the paragraph break with the clear blank line between paragraphs.
- Also, insert 2 paragraphs breaks to clearly indicate a new paragraph.
- If you are unsure of a word or cannot read it, write ??? in its place.
- Do not add, remove, or rearrange any sentences.

Only output the transcribed text following these rules."""

# ===========================================
# FastAPI App
# ===========================================
app = FastAPI(
    title="HTR API",
    description="Handwriting Text Recognition API for Prometheus Cafe",
    version="2.0.0"
)

# Browser origins allowed to call the API: a comma-separated list taken from
# the CORS_ALLOW_ORIGINS environment variable, or "*" (the default) for any
# origin. Blank entries are ignored; an empty value falls back to "*".
_cors_origins = [
    origin.strip()
    for origin in os.getenv("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin must not be combined with credentials: that pairing
    # would let any website make credentialed calls to this API. Cookies and
    # Authorization headers are therefore only allowed once explicit origins
    # have been configured.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ===========================================
# Request/Response Models
# ===========================================
class ExtractRequest(BaseModel):
    """Request body for POST /api/extract."""
    # Base64-encoded image. A "data:...;base64,<payload>" URL is also accepted:
    # extract_text keeps only the part after the comma.
    image: str  # Base64 encoded image

class ExtractResponse(BaseModel):
    """Response body for POST /api/extract."""
    # Transcription returned by the model (content of the first choice).
    text: str

# ===========================================
# Endpoints
# ===========================================
@app.get("/api/health")
async def health_check():
    """Report a static "healthy" status plus the configured vLLM backend.

    The vLLM server itself is not contacted; the model name and endpoint are
    echoed from the module configuration.
    """
    payload = dict(status="healthy", backend="local-vllm")
    payload["model"] = VLLM_MODEL_NAME
    payload["endpoint"] = VLLM_ENDPOINT
    return payload

@app.post("/api/extract", response_model=ExtractResponse)
async def extract_text(request: ExtractRequest):
    """Extract text from a handwritten image using the local vLLM server.

    Args:
        request: Body carrying the image as base64, either bare or as a
            ``data:...;base64,<payload>`` URL.

    Returns:
        ExtractResponse whose ``text`` is the content of the first completion
        choice returned by vLLM.

    Raises:
        HTTPException: 400 if the image cannot be base64-decoded, decodes to
            nothing, or exceeds MAX_IMAGE_SIZE; 504 if the vLLM request times
            out; 500 if vLLM answers with a non-200 status, the request fails
            for another reason, or the response cannot be interpreted.
    """
    # Accept data URLs by keeping only what follows the "data:...;base64," part.
    image_data = request.image
    if "," in image_data:
        image_data = image_data.split(",")[1]

    # Only the decode itself is guarded, so the size/emptiness errors raised
    # below reach the client with their own detail instead of being re-wrapped.
    try:
        decoded = base64.b64decode(image_data)
    except ValueError as e:  # binascii.Error (bad padding) is a ValueError
        raise HTTPException(status_code=400, detail=f"Invalid image data: {str(e)}") from e

    # b64decode ignores non-alphabet characters, so garbage input can decode to
    # an empty payload; reject it here rather than forwarding it to vLLM.
    if not decoded:
        raise HTTPException(status_code=400, detail="Invalid image data: image is empty")

    if len(decoded) > MAX_IMAGE_SIZE:
        raise HTTPException(
            status_code=400,
            detail=f"Image too large. Max size: {MAX_IMAGE_SIZE // 1024 // 1024}MB"
        )

    # OpenAI-style chat-completions request: the prompt and the image travel
    # together as two content parts of a single user message.
    payload = {
        "model": VLLM_MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": HTR_PROMPT},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{image_data}"
                        }
                    }
                ]
            }
        ],
        "max_tokens": 2048,
        "temperature": 0.1,
    }

    # Call local vLLM
    try:
        async with httpx.AsyncClient(timeout=180.0) as client:
            response = await client.post(
                VLLM_ENDPOINT,
                headers={"Content-Type": "application/json"},
                json=payload,
            )
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="vLLM request timed out") from e
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e

    # Checked outside any try block so this error is reported verbatim.
    if response.status_code != 200:
        raise HTTPException(
            status_code=500,
            detail=f"vLLM error {response.status_code}: {response.text}"
        )

    # A malformed body (invalid JSON, missing keys, non-string content) is
    # reported as an extraction failure.
    try:
        result = response.json()
        extracted_text = result["choices"][0]["message"]["content"]
        return ExtractResponse(text=extracted_text)
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e