Python - OCR
Revision as of 05:44, 10 February 2018 by Torben (talk | contribs) (Torben moved page Python - OTR to Python - OCR)
Optical Character Recognition (OCR) via pytesseract and Tesseract
# Digits-only OCR of 'image.png' via OpenCV, Pillow and pytesseract.
#
# The image is read with OpenCV, written to a per-process temporary PNG under
# tmp/, handed to Tesseract through pytesseract, and the recognized text is
# printed. The temporary file is always removed afterwards.

import os
import sys

import cv2  # c:\Python\Scripts\pip install opencv-python
import pytesseract
from PIL import Image  # provided by the pillow package (see requirements below)

# Requirements for pytesseract
# 1. Tesseract https://github.com/tesseract-ocr/tesseract/wiki#windows
#    -> e:\win\progs\Tesseract-OCR\tesseract.exe
#    The Windows PATH must contain e:\win\progs\Tesseract-OCR\
#    Windows environment variable TESSDATA_PREFIX -> e:\win\progs\Tesseract-OCR\
#
# 2. pytesseract
#    c:\Python\Scripts\pip install pillow
#    c:\Python\Scripts\pip install pytesseract

# Read input file. cv2.imread does not raise on a missing/unreadable file;
# it returns None, so check explicitly instead of failing later in imwrite.
image = cv2.imread('image.png')
if image is None:
    raise FileNotFoundError("Could not read input image: image.png")

# Export temp file; the process id keeps concurrent runs from colliding.
os.makedirs("tmp", exist_ok=True)  # no need to create tmp/ by hand
filename = "tmp/{}.png".format(os.getpid())
cv2.imwrite(filename, image)

text = ""
try:
    # Character whitelist: digits only (Tesseract 'digits' config).
    # The context manager closes the file handle so that the temp file can
    # be deleted afterwards (an open handle blocks os.remove on Windows).
    with Image.open(filename) as pil_image:
        text = pytesseract.image_to_string(pil_image, config='digits', lang="eng")
except Exception:
    # Best effort: report the failure and continue with empty text.
    # 'Exception' (not a bare except) lets Ctrl-C / SystemExit propagate.
    print("ERROR Unexpected exception:", sys.exc_info()[0])
    text = ""
finally:
    # Always clean up the temp file, even when OCR failed.
    if os.path.exists(filename):
        os.remove(filename)

print(text)

# NOTE(review): no HighGUI window is created in this script, so this wait
# presumably has no visible effect; kept to preserve the original behavior.
cv2.waitKey(0)