# SuryaOCR

{% code title="vntocr\_suryaocr.py" %}

```python
# Integrates VNTranslator OCR with the SuryaOCR engine
# Version: 1.0
# Author: Fazx - GarudaMods | https://www.patreon.com/vntranslator

"""
# ==================================================================
# Surya OCR: https://github.com/VikParuchuri/surya
# Requires: Python 3.10+ and PyTorch
# Install with: pip install surya-ocr
# ==================================================================
# Run this script with: python vntocr_suryaocr.py
# In VNTranslator use Custom Engine - HTTP POST with configuration:
# -- URL: http://127.0.0.1:5353
# -- Content type: application/json
# -- Headers: {}
# -- Body: {"image":"$IMAGE_BASE64", "langs": ["ja"]}
# -- Response type: JSON
# -- Response query: fullText
# ==================================================================
# Languages (two-letter ISO) https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
# -- Japanese = ja
# -- English = en
# ==================================================================
"""

from flask import Flask, request, jsonify
from PIL import Image
from io import BytesIO
import base64
import re
import json
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

# Host name the Flask server binds to (passed to app.run(host=...)).
APP_HOST = "localhost"
# TCP port the server listens on; it must match the port in the URL
# configured in VNTranslator (http://127.0.0.1:5353).
APP_PORT = 5353
# Passed to app.run(debug=...) to toggle Flask's debug mode.
APP_DEBUG = True

def format_ocr_result(ocr_result):
    """Flatten parsed OCR pages into the response payload.

    Args:
        ocr_result: iterable of dicts, each with a "text_boxes" list whose
            items carry "text" and a "bbox" indexed as [x1, y1, x2, y2].

    Returns:
        A dict with "fullText" (all texts joined by single spaces, in
        reading order) and "boxes" (one dict per text box holding its text,
        width, height and top-left corner), sorted by y and then x.
    """

    def to_entry(line):
        # Convert corner coordinates into origin + size.
        corners = line["bbox"]
        left, top = corners[0], corners[1]
        width, height = corners[2] - left, corners[3] - top
        return {
            "text": line["text"],
            "w": width,
            "h": height,
            "x": left,
            "y": top
        }

    entries = [
        to_entry(line)
        for page in ocr_result
        for line in page["text_boxes"]
    ]

    # Top-to-bottom, then left-to-right.
    entries.sort(key=lambda entry: (entry["y"], entry["x"]))
    joined = " ".join(entry["text"] for entry in entries).strip()

    return {"fullText": joined, "boxes": entries}

def parse_ocr_result(ocr_result):
    """Convert OCR result objects into plain dicts and format them.

    Args:
        ocr_result: list of result objects exposing ``text_lines``,
            ``languages`` and ``image_bbox``; each text line exposes
            ``polygon``, ``confidence``, ``text`` and ``bbox``.

    Returns:
        Whatever ``format_ocr_result`` returns for the converted pages.

    Raises:
        ValueError: if ``ocr_result`` is not a list.
    """
    if not isinstance(ocr_result, list):
        raise ValueError("ocr_result is not a list")

    # Copy attributes into dicts so the data is JSON-serialisable downstream.
    pages = [
        {
            "text_boxes": [
                {
                    "polygon": entry.polygon,
                    "confidence": entry.confidence,
                    "text": entry.text,
                    "bbox": entry.bbox
                }
                for entry in page.text_lines
            ],
            "languages": page.languages,
            "image_bbox": page.image_bbox
        }
        for page in ocr_result
    ]

    return format_ocr_result(pages)

############################################################

# Flask application that exposes the OCR endpoint.
app = Flask(__name__)
# The detection and recognition models/processors are loaded once at module
# import and then handed to run_ocr() on every request.
det_processor = load_det_processor()
det_model = load_det_model()
rec_model = load_rec_model()
rec_processor = load_rec_processor()

@app.route("/", methods=["POST"])
def ocr_endpoint():
    """Handle ``POST /``: run Surya OCR on a Base64-encoded image.

    Expected JSON object in the request body:
        image: Base64-encoded image data (required). A leading
            ``data:...;base64,`` data-URL prefix is stripped if present.
        langs: language codes passed to ``run_ocr`` (default ``["ja"]``).
        draw_bounding_box: flag echoed back in the response
            (default ``False``).

    Returns:
        On success, an ``application/json`` response containing
        ``fullText``, ``boxes`` and ``draw_bounding_box``.
        A 400 JSON error when the body is not a JSON object, the image is
        missing, or the image cannot be decoded.
        A 500 JSON error for any other failure.
    """
    try:
        print("\n\n=== OCR Request ===")
        print(f"Method: {request.method}")
        print(f"Headers: {dict(request.headers)}")

        if not request.is_json:
            print("Request is not JSON")
            return jsonify({"error": "Request must be JSON"}), 400

        # silent=True makes malformed JSON yield None instead of raising, so
        # it is reported as a client error (400) rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            print("Request body is not a JSON object")
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # log payload
        print(f"Request JSON keys: {list(data.keys())}")

        # check image
        if "image" not in data:
            print("No image data")
            return jsonify({"error": "No image data"}), 400

        # decode base64 image
        try:
            image_b64 = data["image"]
            # b64decode silently drops non-alphabet characters, so a data-URL
            # header would corrupt the payload; cut it off explicitly.
            if isinstance(image_b64, str) and image_b64.startswith("data:"):
                image_b64 = image_b64.partition(",")[2]
            image_decode = base64.b64decode(image_b64)
            image = Image.open(BytesIO(image_decode))
            print("Image successfully decoded from Base64")
        except Exception as e:
            print(f"Image decoding failed: {e}")
            return jsonify({"error": f"Image decoding failed: {str(e)}"}), 400

        # check langs
        langs = data.get("langs", ["ja"])
        print(f"langs: {langs}")

        # check draw bounding box
        draw_bounding_box = data.get("draw_bounding_box", False)
        print(f"draw_bounding_box: {draw_bounding_box}")

        # run ocr
        # https://github.com/VikParuchuri/surya?tab=readme-ov-file#from-python
        result = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
        print(f"OCR completed successfully: {result}")

        # Shape of `result`:
        # [OCRResult(
        #     text_lines=[
        #         TextLine(polygon=[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]],
        #                  confidence=0.0, text='String', bbox=[0.0, 0.0, 0.0, 0.0]),
        #         ...
        #     ],
        #     languages=['ja'], image_bbox=[0.0, 0.0, 0.0, 0.0]
        # )]

        # parse result
        parsed_result = parse_ocr_result(result)
        parsed_result['draw_bounding_box'] = draw_bounding_box
        # ensure_ascii=False keeps non-ASCII text (e.g. Japanese) readable.
        json_result = json.dumps(parsed_result, indent=4, ensure_ascii=False)
        # Returning the bare string would be served as text/html; label it
        # as JSON explicitly.
        return app.response_class(json_result, mimetype="application/json")

    except Exception as e:
        print(f"Error request: {e}")
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    print(f"=== Starting OCR server {APP_HOST} on port {APP_PORT} ===")
    # Debug mode turns on the auto-reloader by default, which runs this module
    # in a second process and so loads the OCR models twice. Disable the
    # reloader while keeping the rest of debug mode.
    app.run(debug=APP_DEBUG, host=APP_HOST, port=APP_PORT, use_reloader=False)

```

{% endcode %}


---

# Agent Instructions: Querying This Documentation

If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter:

```
GET https://docs.vntranslator.com/advanced/ocr-server-kit/suryaocr.md?ask=<question>
```

The question should be specific, self-contained, and written in natural language.
The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
