From dcca409772efdf7dcac302016f28201cc49228aa Mon Sep 17 00:00:00 2001
From: kingc2022 <99593855+kingc2022@users.noreply.github.com>
Date: Tue, 1 Oct 2024 18:26:31 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=AE=9E=E7=8E=B0=E5=9F=BA=E4=BA=8EFas?=
=?UTF-8?q?tAPI=E7=9A=84OCR=E6=9C=8D=E5=8A=A1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
.gitignore | 323 ++++++++++++++++++
.idea/.gitignore | 8 +
.../inspectionProfiles/profiles_settings.xml | 6 +
.idea/misc.xml | 7 +
.idea/modules.xml | 8 +
.idea/ocr.iml | 10 +
.idea/vcs.xml | 6 +
main.py | 147 ++++++++
8 files changed, 515 insertions(+)
create mode 100644 .gitignore
create mode 100644 .idea/.gitignore
create mode 100644 .idea/inspectionProfiles/profiles_settings.xml
create mode 100644 .idea/misc.xml
create mode 100644 .idea/modules.xml
create mode 100644 .idea/ocr.iml
create mode 100644 .idea/vcs.xml
create mode 100644 main.py
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..212f260
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,323 @@
+### VisualStudioCode template
+.vscode/*
+!.vscode/settings.json
+!.vscode/tasks.json
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/*.code-snippets
+
+# Local History for Visual Studio Code
+.history/
+
+# Built Visual Studio Code Extensions
+*.vsix
+
+### JetBrains template
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin
+.idea/sonarlint/
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### VirtualEnv template
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+.venv
+pip-selfcheck.json
+
+### Windows template
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+
+# Dump file
+*.stackdump
+
+# Folder config file
+[Dd]esktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+### macOS template
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..35410ca
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# 默认忽略的文件
+/shelf/
+/workspace.xml
+# 基于编辑器的 HTTP 客户端请求
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..870f088
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..68d7201
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/ocr.iml b/.idea/ocr.iml
new file mode 100644
index 0000000..2c80e12
--- /dev/null
+++ b/.idea/ocr.iml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..94a25f7
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..b09ddb8
--- /dev/null
+++ b/main.py
@@ -0,0 +1,147 @@
+from fastapi import FastAPI, UploadFile, File, Query
+from fastapi.responses import JSONResponse
+import easyocr
+from typing import Optional
+import asyncio
+import aiohttp # 用于异步 HTTP 请求
+
+app = FastAPI()
+
+# OCR reader for the common ch_sim + en language pair, created once at import time.
+reader_sim = easyocr.Reader(['ch_sim', 'en'])
+
+# Cache of OCR readers, keyed by the sorted tuple of language codes (see get_reader).
+readers = {}
+
+# asyncio lock that get_reader holds while creating a reader that is not cached yet.
+lock = asyncio.Lock()
+
+
+# Return the OCR reader for a language combination, creating and caching it on first use.
+async def get_reader(lang_1: Optional[str], lang_2: Optional[str]):
+    """Return ``(reader, None)`` on success or ``(None, error_message)`` on failure."""
+    # Drop empty and duplicate codes so ("en", "en") and ("en",) share one cache key.
+    lang_combination = tuple(sorted({lang for lang in (lang_1, lang_2) if lang}))
+
+    # No language requested: fall back to the preloaded ch_sim + en reader.
+    if not lang_combination:
+        return reader_sim, None
+
+    # Fast path without the lock for combinations that are already cached.
+    if lang_combination in readers:
+        return readers[lang_combination], None
+
+    # Take the lock only when a new reader has to be created.
+    async with lock:
+        # Check again: another request may have created it while this one waited.
+        if lang_combination not in readers:
+            try:
+                # Reader() is a blocking call; keep it off the event loop thread.
+                loop = asyncio.get_running_loop()
+                readers[lang_combination] = await loop.run_in_executor(
+                    None, easyocr.Reader, list(lang_combination))
+            except ValueError as ve:
+                return None, str(ve)
+            except Exception as e:
+                return None, f"Unexpected error: {str(e)}"
+    return readers[lang_combination], None
+
+
+# Shared OCR routine used by both /ocr endpoints.
+async def process_ocr(reader, image_data):
+    """Run OCR on ``image_data`` and return ``(response_body, http_status)``.
+
+    ``reader`` must provide ``readtext(image_data)`` yielding
+    ``(bbox, text, confidence)`` triples. Any exception is turned into an
+    error body with status 500 instead of being raised.
+    """
+    try:
+        # readtext() is a blocking call; run it in the default executor so the
+        # event loop is not stalled while the image is being recognised.
+        # NOTE(review): assumes readtext() may run in a worker thread -- confirm.
+        loop = asyncio.get_running_loop()
+        result = await loop.run_in_executor(None, reader.readtext, image_data)
+
+        output = [
+            {
+                "text": text,
+                # Convert every coordinate to a built-in int for the JSON body.
+                "bbox": [[int(coord) for coord in point] for point in bbox],
+                # Likewise force the score to a built-in float.
+                "confidence": float(confidence),
+            }
+            for (bbox, text, confidence) in result
+        ]
+
+        return {"status": "success", "message": "OCR 处理成功", "data": output}, 200
+    except Exception as e:
+        # Report the failure in the response body rather than raising.
+        return {
+            "status": "error",
+            "message": "OCR 处理失败",
+            "error": str(e),
+            "data": None
+        }, 500
+
+
+@app.post("/ocr")
+async def ocr_image(lang_1: Optional[str] = None, lang_2: Optional[str] = None, image: UploadFile = File(...)):
+    """Run OCR on an uploaded image file and answer with a JSON document."""
+
+    def _failure(message, detail):
+        # Uniform 500 response used for every failure raised before OCR starts.
+        return JSONResponse(status_code=500, content={
+            "status": "error",
+            "message": message,
+            "error": detail,
+            "data": None
+        })
+
+    # Resolve the reader for the requested languages; stop early if that failed.
+    ocr_reader, reader_error = await get_reader(lang_1, lang_2)
+    if reader_error:
+        return _failure("OCR 处理失败", reader_error)
+
+    try:
+        # Raw bytes of the uploaded file.
+        payload = await image.read()
+    except Exception as exc:
+        return _failure("无法读取图片文件", str(exc))
+
+    # Delegate to the shared OCR routine and relay its body and status code.
+    body, status = await process_ocr(ocr_reader, payload)
+    return JSONResponse(status_code=status, content=body)
+
+
+@app.get("/ocr")
+async def ocr_image_from_url(lang_1: Optional[str] = None, lang_2: Optional[str] = None, url: str = Query(...)):
+    """Download the image at ``url``, run OCR on it and answer with a JSON document."""
+    # NOTE(review): ``url`` is caller-controlled, so the server can be made to request
+    # internal hosts (SSRF); restrict the allowed targets before exposing this publicly.
+    timeout = aiohttp.ClientTimeout(total=30)  # bound the whole download
+    try:
+        async with aiohttp.ClientSession(timeout=timeout) as session:
+            async with session.get(url) as response:
+                if response.status != 200:
+                    raise Exception(f"无法获取图片, HTTP 状态码: {response.status}")
+                image_data = await response.read()
+    except Exception as e:
+        return JSONResponse(status_code=500, content={
+            "status": "error",
+            "message": "无法从 URL 获取图片",
+            # Some exceptions (e.g. a bare timeout) have an empty message; use the class name then.
+            "error": str(e) or type(e).__name__,
+            "data": None
+        })
+
+    # The HTTP session is closed here; now resolve the reader for the requested languages.
+    reader, error = await get_reader(lang_1, lang_2)
+    if error:
+        return JSONResponse(status_code=500, content={
+            "status": "error",
+            "message": "OCR 处理失败",
+            "error": error,
+            "data": None
+        })
+    response_data, status_code = await process_ocr(reader, image_data)
+    return JSONResponse(status_code=status_code, content=response_data)