> For the complete documentation index, see [llms.txt](https://docs.vntranslator.com/llms.txt). Markdown versions of documentation pages are available by appending `.md` to page URLs; this page is available as [Markdown](https://docs.vntranslator.com/advanced/ocr-server-kit/suryaocr.md).

# SuryaOCR

{% code title="vntocr\_suryaocr.py" %}

```python
# Integration of VNTranslator OCR with the SuryaOCR engine
# Version: 1.0
# Author: Fazx - GarudaMods | https://www.patreon.com/vntranslator

"""
# ==================================================================
# Surya OCR: https://github.com/VikParuchuri/surya
# Requires: Python 3.10+ and PyTorch
# Install with: pip install surya-ocr
# ==================================================================
# Run this script with: python vntocr_suryaocr.py
# In VNTranslator use Custom Engine - HTTP POST with configuration:
# -- URL: http://127.0.0.1:5353
# -- Content type: application/json
# -- Headers: {}
# -- Body: {"image":"$IMAGE_BASE64", "langs": ["ja"]}
# -- Response type: JSON
# -- Response query: fullText
# ==================================================================
# Languages (two-letter ISO) https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes
# -- Japanese = ja
# -- English = en
# ==================================================================
"""

from flask import Flask, request, jsonify
from PIL import Image
from io import BytesIO
import base64
import re
import json
from surya.ocr import run_ocr
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor
from surya.model.recognition.model import load_model as load_rec_model
from surya.model.recognition.processor import load_processor as load_rec_processor

# Host/interface handed to app.run(host=...) when the script is started directly.
APP_HOST = "localhost"
# TCP port handed to app.run(port=...); must match the port used in the
# VNTranslator "URL" setting described in the header above.
APP_PORT = 5353
# Handed to app.run(debug=...) to switch Flask debug mode on or off.
APP_DEBUG = True

def format_ocr_result(ocr_result):
    """Flatten parsed OCR pages into one sorted list of text boxes.

    Args:
        ocr_result: iterable of dicts, each with a ``"text_boxes"`` list whose
            items provide ``"text"`` and a ``"bbox"`` of ``[x1, y1, x2, y2]``.

    Returns:
        A dict with ``"fullText"`` (the box texts joined by single spaces and
        stripped) and ``"boxes"`` (records with ``text``, ``w``, ``h``, ``x``,
        ``y``), both ordered by ``y`` first and ``x`` second.
    """
    def to_record(entry):
        # Convert a corner-based bbox into origin + size.
        coords = entry["bbox"]
        left, top = coords[0], coords[1]
        width, height = coords[2] - left, coords[3] - top
        return {
            "text": entry["text"],
            "w": width,
            "h": height,
            "x": left,
            "y": top,
        }

    # Top-to-bottom, then left-to-right; sorted() is stable like list.sort().
    ordered = sorted(
        (to_record(entry) for page in ocr_result for entry in page["text_boxes"]),
        key=lambda record: (record["y"], record["x"]),
    )
    joined = " ".join(record["text"] for record in ordered)

    return {"fullText": joined.strip(), "boxes": ordered}

def parse_ocr_result(ocr_result):
    """Convert OCR result objects into plain dicts and format them.

    Args:
        ocr_result: list of result objects exposing ``text_lines``,
            ``languages`` and ``image_bbox``; every text line exposes
            ``polygon``, ``confidence``, ``text`` and ``bbox``.

    Returns:
        Whatever ``format_ocr_result`` returns for the converted pages.

    Raises:
        ValueError: if ``ocr_result`` is not a list.
    """
    if not isinstance(ocr_result, list):
        raise ValueError("ocr_result is not a list")

    # Copy attribute-based results into dictionaries, one per page.
    pages = [
        {
            "text_boxes": [
                {
                    "polygon": entry.polygon,
                    "confidence": entry.confidence,
                    "text": entry.text,
                    "bbox": entry.bbox,
                }
                for entry in page.text_lines
            ],
            "languages": page.languages,
            "image_bbox": page.image_bbox,
        }
        for page in ocr_result
    ]

    return format_ocr_result(pages)

############################################################

# Flask application object; the OCR route below is registered on it.
app = Flask(__name__)
# Detection/recognition models and processors are created once at module
# level and reused: the request handler passes these same objects to run_ocr.
det_processor = load_det_processor()
det_model = load_det_model()
rec_model = load_rec_model()
rec_processor = load_rec_processor()

@app.route("/", methods=["POST"])
def ocr_endpoint():
    """Run OCR on a Base64-encoded image posted as JSON.

    Request body (JSON object):
        image: Base64-encoded image data (required). A leading
            ``data:<mime>;base64,`` prefix is accepted and stripped.
        langs: a language code or a list of codes; defaults to ``["ja"]``.
        draw_bounding_box: echoed back unchanged in the response;
            defaults to ``False``.

    Returns:
        200 with a JSON document containing ``fullText``, ``boxes`` and
        ``draw_bounding_box``; 400 with ``{"error": ...}`` when the request
        is not valid JSON, is not a JSON object, lacks ``image`` or the image
        cannot be decoded; 500 with ``{"error": ...}`` on any other failure.
    """
    try:
        print("\n\n=== OCR Request ===")
        print(f"Method: {request.method}")
        print(f"Headers: {dict(request.headers)}")

        if not request.is_json:
            print("Request is not JSON")
            return jsonify({"error": "Request must be JSON"}), 400

        # silent=True returns None for a malformed body instead of raising,
        # so a client error is reported as 400 rather than falling through
        # to the generic 500 handler below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            print("Request body is not a JSON object")
            return jsonify({"error": "Request body must be a JSON object"}), 400

        # log payload
        print(f"Request JSON keys: {list(data.keys())}")

        # check image
        if "image" not in data:
            print("No image data")
            return jsonify({"error": "No image data"}), 400

        # decode base64 image
        try:
            image_data = data["image"]
            # Accept data URIs: keep only the payload after the first comma.
            if isinstance(image_data, str) and image_data.startswith("data:"):
                image_data = image_data.partition(",")[2]
            image_decode = base64.b64decode(image_data)
            image = Image.open(BytesIO(image_decode))
            # Image.open is lazy; force decoding here so that corrupt pixel
            # data is reported as a 400 instead of failing inside the OCR run.
            image.load()
            print("Image successfully decoded from Base64")
        except Exception as e:
            print(f"Image decoding failed: {e}")
            return jsonify({"error": f"Image decoding failed: {str(e)}"}), 400

        # check langs
        langs = data.get("langs", ["ja"])
        # run_ocr is given one list of languages per image; wrap a bare code.
        if isinstance(langs, str):
            langs = [langs]
        print(f"langs: {langs}")

        # check draw bounding box
        draw_bounding_box = data.get("draw_bounding_box", False)
        print(f"draw_bounding_box: {draw_bounding_box}")

        # run ocr
        # https://github.com/VikParuchuri/surya?tab=readme-ov-file#from-python
        result = run_ocr([image], [langs], det_model, det_processor, rec_model, rec_processor)
        print(f"OCR completed successfully: {result}")

        # Shape of `result` as printed above:
        # [OCRResult(
        #     text_lines=[
        #         TextLine(polygon=[[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]], confidence=0.0, text='String', bbox=[0.0, 0.0, 0.0, 0.0]),
        #         ...
        #     ],
        #     languages=['ja'], image_bbox=[0.0, 0.0, 0.0, 0.0]
        # )]

        # parse result
        parsed_result = parse_ocr_result(result)
        parsed_result['draw_bounding_box'] = draw_bounding_box
        # ensure_ascii=False keeps non-ASCII text (e.g. Japanese) readable.
        json_result = json.dumps(parsed_result, indent=4, ensure_ascii=False)
        # Returning the bare string would be served as text/html; label the
        # body as JSON so it matches the error responses built with jsonify.
        return app.response_class(json_result, mimetype="application/json")

    except Exception as e:
        # Last-resort boundary: report unexpected failures to the client.
        print(f"Error request: {e}")
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    print(f"=== Starting OCR server {APP_HOST} on port {APP_PORT} ===")
    # The OCR models are loaded at module level. With debug=True Werkzeug's
    # auto-reloader re-executes this script in a child process, which would
    # load every model a second time. Keep debug mode but turn the reloader off.
    app.run(debug=APP_DEBUG, host=APP_HOST, port=APP_PORT, use_reloader=False)

```

{% endcode %}


---

# Agent Instructions
This documentation is published with GitBook. GitBook is the documentation platform designed so that both humans and AI agents can read, navigate, and reason over technical content effectively. Learn more at gitbook.com.

## Querying This Documentation
If you need additional information that is not directly available in this page, you can query the documentation dynamically by asking a question.

Perform an HTTP GET request on the current page URL with the `ask` query parameter, and the optional `goal` query parameter:

```
GET https://docs.vntranslator.com/advanced/ocr-server-kit/suryaocr.md?ask=<question>&goal=<endgoal>
```

`ask` is the immediate question: it should be specific, self-contained, and written in natural language.
`goal` is optional and describes the broader end goal you are ultimately trying to accomplish on behalf of the user. GitBook uses it to tailor the answer towards what is most useful for that goal.

The response will contain a direct answer to the question and relevant excerpts and sources from the documentation.

Use this mechanism when the answer is not explicitly present in the current page, you need clarification or additional context, or you want to retrieve related documentation sections.
