Python - OCR

From Torben's Wiki
Revision as of 05:44, 10 February 2018 by Torben (talk | contribs) (Torben moved page Python - OTR to Python - OCR)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)


Optical Character Recognition (OCR) via pytesseract and Tesseract

import os

import cv2 # c:\Python\Scripts\pip install opencv-python
from PIL import Image  # provided by the pillow package (see requirements below)
import pytesseract
# Requirements pytesseract
# 1. Tesseract  https://github.com/tesseract-ocr/tesseract/wiki#windows
# -> e:\win\progs\Tesseract-OCR\tesseract.exe
# The Windows PATH must contain e:\win\progs\Tesseract-OCR\
# Windows environment variable TESSDATA_PREFIX ->  e:\win\progs\Tesseract-OCR\
#
# 2. pytesseract
# c:\Python\Scripts\pip install pillow
# c:\Python\Scripts\pip install pytesseract

# OCR demo: load image.png with OpenCV, write it to a temporary PNG, let
# Tesseract (via pytesseract) read the digits from it, and print the result.

# Read input file. cv2.imread signals failure by returning None rather than
# raising, so stop here with a clear message instead of failing later.
image = cv2.imread('image.png')
if image is None:
    raise OSError("could not read input image 'image.png'")

# Export temp file, named after the process id.
os.makedirs("tmp", exist_ok=True)  # create the temp directory if it is missing
filename = "tmp/{}.png".format(os.getpid())
if not cv2.imwrite(filename, image):
    raise OSError("could not write temporary image '{}'".format(filename))

text = ""
try:
    # Open the image in a context manager so the file handle is closed before
    # the temp file is deleted (an open file cannot be removed on Windows).
    with Image.open(filename) as pil_image:
        # character whitelist: digits only
        text = pytesseract.image_to_string(pil_image, config='digits', lang="eng")
except Exception as exc:  # deliberately broad: OCR is best effort, report and continue
    print("ERROR Unexpected exception:", type(exc))
    text = ""
finally:
    # Always clean up the temp file, whether or not OCR succeeded.
    os.remove(filename)
print(text)
# NOTE(review): no window is opened anywhere in this script, so this wait looks
# vestigial — confirm before removing.
cv2.waitKey(0)