当前位置：首页 > news >正文

基于Tesseract_OCR识别

news 2025/7/2 11:44:12

1、安装Tesseract
Mac版本，通过Homebrew进行安装即可

brew install tesseract

Windows版本安装

下载地址：https://digi.bib.uni-mannheim.de/tesseract/

2、更换语言包

下载语言包（下文程序使用 lang='chi_sim'，因此需要简体中文语言包 chi_sim.traineddata，下载后放入 Tesseract 安装目录下的 tessdata 文件夹）

https://github.com/tesseract-ocr/tesseract

亦可参照这个 Tesseract最新版语言包chi_sim.traineddata(4.0.0)GitHub官方获取免csdn积分，各个版本语言包全有_tesseract github releases-CSDN博客

3、程序

import os
from PIL import Image
import pytesseract
import re
import pandas as pd# 如果你使用 Windows，指定 Tesseract 的安装路径
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'# 1. 遍历文件夹中的所有图片并进行OCR识别
def process_images(folder_path, output_txt):
    """OCR every image in a folder, save the raw text, then clean it.

    Each image whose name ends in a known image extension is read with
    Tesseract (``lang='chi_sim'``). The recognised text is post-processed
    and appended to ``output_txt`` under an ``Image: <filename>`` heading,
    followed by a separator line of 40 ``=`` characters. Once all images
    are written, ``clean_txt`` is called to produce a filtered copy.

    Args:
        folder_path: Directory that contains the images.
        output_txt: Path of the text file receiving the raw OCR output.

    Returns:
        Path of the cleaned text file: the file name of ``output_txt``
        prefixed with ``cleaned_``, placed in the same directory.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff')
    # Characters Tesseract tends to produce in place of the yen sign.
    yen_fixes = str.maketrans({'#': '¥', 'Y': '¥', '*': '¥'})

    with open(output_txt, 'w', encoding='utf-8') as f:
        # os.listdir() returns entries in arbitrary order; sort so that the
        # output file is reproducible from run to run.
        for filename in sorted(os.listdir(folder_path)):
            # Compare case-insensitively so that e.g. "SCAN.PNG" is not skipped.
            if not filename.lower().endswith(image_suffixes):
                continue

            image_path = os.path.join(folder_path, filename)
            print(f"Processing image: {filename}")

            # The context manager releases the underlying file handle as soon
            # as OCR is done instead of leaving it to garbage collection.
            with Image.open(image_path) as img:
                text = pytesseract.image_to_string(img, lang='chi_sim')

            text = text.translate(yen_fixes)

            # Drop every whitespace character inside each line while keeping
            # the line structure itself.
            processed_text = '\n'.join(
                re.sub(r'\s', '', line) for line in text.splitlines()
            )

            f.write(f"Image: {filename}\n")
            f.write(processed_text)
            f.write("\n" + "=" * 40 + "\n")

    # Prefix only the file name: prefixing the whole string would corrupt the
    # directory part when output_txt is something like "out/result.txt".
    out_dir, out_name = os.path.split(output_txt)
    cleaned_txt_file = os.path.join(out_dir, 'cleaned_' + out_name)
    clean_txt(output_txt, cleaned_txt_file)
    print(f"Cleaned text saved to {cleaned_txt_file}")
    return cleaned_txt_file


# 2. Clean the txt file: drop unrelated content, keep only the listed fields
def clean_txt(txt_file, cleaned_txt_file):
    """Copy only the lines carrying a known field name to a new file.

    A line is kept when it contains one of the field names listed below.
    The "套餐内容" field is special: the lines that follow it are glued
    onto it (without any separator) until the next line containing a
    field name, so a multi-line value ends up on a single output line.
    All other lines are discarded.

    Args:
        txt_file: Path of the UTF-8 text file to read.
        cleaned_txt_file: Path of the UTF-8 text file to write.
    """
    field_names = ["券码", "券类型", "套餐内容", "验证时间", "消费金额", "消费明细", "订单号", "验券账号"]

    with open(txt_file, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    kept = []          # output lines, each already newline-terminated
    fragments = []     # pieces of the "套餐内容" value being assembled
    collecting = False  # True while inside a "套餐内容" value

    for raw in raw_lines:
        text = raw.strip()

        if not any(name in text for name in field_names):
            # Not a field line: either a continuation of the package
            # content, or noise to be ignored.
            if collecting:
                fragments.append(text)
            continue

        if "套餐内容" in text:
            fragments.append(text)
            collecting = True
            continue

        # Any other field ends a pending package-content value.
        if collecting:
            kept.append(''.join(fragments) + '\n')
            fragments = []
            collecting = False
        kept.append(text + '\n')

    # Flush a package-content value that ran to the end of the input.
    leftover = ''.join(fragments)
    if leftover:
        kept.append(leftover + '\n')

    with open(cleaned_txt_file, 'w', encoding='utf-8') as target:
        target.writelines(kept)


# Run the pipeline
image_folder = '/pytorchPeoject/testDemo1/img/'  # folder holding the images
output_txt_file = 'output_02.txt'  # intermediate file for the raw OCR text

# OCR the images and filter the text; the call returns the path of the
# filtered file.
cleaned_txt_file = process_images(image_folder, output_txt_file)

# Load the whole filtered file into memory.
with open(cleaned_txt_file, 'r', encoding='utf-8') as file:
    data = file.read()

# One match per record. Date and time are captured as two separate groups
# with nothing between them, so the pattern expects them to be adjacent.
pattern = re.compile(r'券码:(\d+)\s+券类型:(.+?)\s+套餐内容:(.+?)\s+验证时间:(\d{4}-\d{2}-\d{2})(\d{2}:\d{2}:\d{2})\s+消费金额:(.+?)\s+消费明细:(.+?)\s+订单号:(\d+)\s+验券账号:(\w+)')
matches = pattern.findall(data)

# One column per capture group, in pattern order.
columns = ['券码', '券类型', '套餐内容', '验证时间', '时间', '消费金额', '消费明细', '订单号', '验券账号']
df = pd.DataFrame(matches, columns=columns)

# Fold the time-of-day into the date column, then drop the helper column.
df['验证时间'] = df['验证时间'] + ' ' + df['时间']
df.drop(columns=['时间'], inplace=True)

# Write the result to an Excel workbook.
df.to_excel('output.xlsx', index=False)
print("数据提取并保存到Excel文件成功！")

查看全文

http://www.mrgr.cn/news/30765.html