Differences

This shows you the differences between two versions of the page.

--- development:python:ocr [2024/08/30 03:53] – [OCR an Index with PyTesseract] tungnt
+++ development:python:ocr [2024/08/31 01:47] (current) – [Tessereact] tungnt
@@ Line 46: / Line 46: @@
 <code bash>
 pip install pytesseract
+</code>
+**Languages (Tesseract language data):**
+  * https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html
+  * https://www.kaggle.com/code/dhorvay/pytesseract-multiple-languages
+<code python>
+print(pytesseract.get_languages(config='.'))  # the list on the next line appears to be this call's printed output
+['afr', 'amh', 'ara', 'asm', 'aze', 'aze_cyrl', 'bel', 'ben', 'bod', 'bos', 'bre', 'bul', 'cat', 'ceb', 'ces', 'chi_sim', 'chi_sim_vert', 'chi_tra', 'chi_tra_vert', 'chr', 'cos', 'cym', 'dan', 'deu', 'div', 'dzo', 'ell', 'eng', 'enm', 'epo', 'equ', 'est', 'eus', 'fao', 'fas', 'fil', 'fin', 'fra', 'frk', 'frm', 'fry', 'gla', 'gle', 'glg', 'grc', 'guj', 'hat', 'heb', 'hin', 'hrv', 'hun', 'hye', 'iku', 'ind', 'isl', 'ita', 'ita_old', 'jav', 'jpn', 'jpn_vert', 'kan', 'kat', 'kat_old', 'kaz', 'khm', 'kir', 'kmr', 'kor', 'kor_vert', 'lao', 'lat', 'lav', 'lit', 'ltz', 'mal', 'mar', 'mkd', 'mlt', 'mon', 'mri', 'msa', 'mya', 'nep', 'nld', 'nor', 'oci', 'ori', 'osd', 'pan', 'pol', 'por', 'pus', 'que', 'ron', 'rus', 'san', 'sin', 'slk', 'slv', 'snd', 'snum', 'spa', 'spa_old', 'sqi', 'srp', 'srp_latn', 'sun', 'swa', 'swe', 'syr', 'tam', 'tat', 'tel', 'tgk', 'tha', 'tir', 'ton', 'tur', 'uig', 'ukr', 'urd', 'uzb', 'uzb_cyrl', 'vie', 'yid', 'yor']
+from langdetect import detect_langs
+detect_langs(ocr_result_original)  # ocr_result_original is not defined in this snippet; presumably OCR text from an earlier step
+tesseract --list-langs  # NOTE(review): looks like the Tesseract CLI command (run in a shell), not Python
 </code>
 ====== How to Open an Image in Python with PIL (Pillow) ======
@@ Line 455: / Line 471: @@
 </file>
 ====== OCR a Text with Marginalia by Extracting the Body ======
+{{ :development:python:sample_mgh.jpg |}}
 https://github.com/1sitevn/python-jupyter/blob/main/ocr/06_OCR_a_Text_with_Marginalia_by_Extracting_the_Body.ipynb
@@ Line 498: / Line 516: @@
 ====== Separate a Footnote from Body Text ======
+https://github.com/1sitevn/python-jupyter/blob/main/ocr/07_OCR_Separate_a_Footnote_from_Body_Text.ipynb
 <file python>
+import cv2
+import pytesseract
+from PIL import Image  # NOTE: Image is not referenced anywhere in this snippet
+image = cv2.imread('../data/ocr/sample_mgh_2.jpg')  # load the sample page
+base_image = image.copy()  # separate copy; the crops below are taken from it
+gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # BGR -> grayscale
+blur = cv2.GaussianBlur(gray, (7,7), 0)  # 7x7 Gaussian blur before thresholding
+thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]  # inverted binary + Otsu; [1] picks the thresholded image from the returned pair
+# Create a rectangular structuring element and dilate the thresholded image with it
+kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,25))  # ksize is (width, height): 3 px wide, 25 px tall
+dilate = cv2.dilate(thresh, kernel, iterations=1)
+# Find contours, order them by position, then crop and OCR the large ones (the rectangle drawing is commented out)
+cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # the contour list is element 0 of a 2-item result, element 1 otherwise
+cnts = sorted(cnts, key=lambda x: cv2.boundingRect(x)[1])  # sort by bounding-box y (ascending)
+main_text = ""  # NOTE: never read or updated later in this snippet
+for c in cnts:
+    x,y,w,h = cv2.boundingRect(c)
+    if h > 200 and w > 250:  # only contours taller than 200 px and wider than 250 px
+        roi = base_image[y:y+h, 0:x]  # rows y..y+h, columns 0..x; NOTE(review): w is unused here and the crop is empty when x == 0 — confirm intended
+#         cv2.rectangle(image, (0, y), (x, 0 + h+20), (36,255,12), 2)
+        constant= cv2.copyMakeBorder(roi.copy(),30,30,30,30,cv2.BORDER_CONSTANT,value=[255,255,255])  # pad 30 px of white on every side
+        ocr_result = pytesseract.image_to_string(constant)  # OCR the padded crop
+        cv2.imwrite("../data/temp/output.png", roi)  # same path every iteration, so each match overwrites the previous file; the unpadded crop is written
+        print (ocr_result)
+#         print (ocr_result)
+# cv2.imwrite("temp/output.png", image)
 </file>
@@ Line 508: / Line 561: @@
   * https://www.affinda.com/tech-ai/how-to-convert-image-to-text-using-python
   * https://tesseract-ocr.github.io/tessdoc/Compiling.html#macos
+  * https://nanonets.com/blog/ocr-with-tesseract/