I have two python codes. The first code is designed to split pdf to images then crop each image into four parts and the final output of images is put in Stickers
folder
from pathlib import Path
import shutil
import fitz
from PIL import Image
import cv2, os
def crop(image_path, coords, saved_location):
image_obj = Image.open(image_path)
cropped_image = image_obj.crop(coords)
cropped_image.save(saved_location)
def all_white_pixels(img):
H, W = img.shape[:2]
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
pixels = cv2.countNonZero(thresh)
return True if pixels == (H * W) else False
BASE_DIR = Path.cwd()
OUTPUT_DIR = BASE_DIR / "Output"
STICKERS_DIR = BASE_DIR / "Stickers"
try:
shutil.rmtree(OUTPUT_DIR)
except OSError as e:
pass
try:
shutil.rmtree(STICKERS_DIR)
except:
pass
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
STICKERS_DIR.mkdir(parents=True, exist_ok=True)
doc = fitz.open("Sample.pdf")
for page in doc:
pix = page.get_pixmap()
img = "Output/Page-%i.png" % page.number
pix.save(img)
s1 = 'Stickers/Page-%i-Cropped1.png' % page.number
s2 = 'Stickers/Page-%i-Cropped2.png' % page.number
s3 = 'Stickers/Page-%i-Cropped3.png' % page.number
s4 = 'Stickers/Page-%i-Cropped4.png' % page.number
crop(img, (34, 6, 588, 192), s1)
crop(img, (34, 192, 588, 373), s2)
crop(img, (34, 373, 588, 550), s3)
crop(img, (34, 550, 588, 730), s4)
img1 = cv2.imread(s1)
if all_white_pixels(img1):
os.remove(s1)
img2 = cv2.imread(s2)
if all_white_pixels(img2):
os.remove(s2)
img3 = cv2.imread(s3)
if all_white_pixels(img3):
os.remove(s3)
img4 = cv2.imread(s4)
if all_white_pixels(img4):
os.remove(s4)
And I have another python code that reads the numbers on the stickers then rename those images according to the number
import os, pytesseract
import cv2
sTemp = "Temp.png"
directory = '.\Stickers'
def useMagick(img):
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
command = 'magick convert {} -resize 2048x640 -density 200 -quality 100 {}'.format(img, sTemp)
os.system(command)
def readNumber(img):
img = cv2.imread(img)
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
txt = pytesseract.image_to_string(gry)
try:
return re.findall(r'\d+\s?\/\s?(\d+)', txt)[0]
except:
blur = cv2.GaussianBlur(gry, (3,3), 0)
txt = pytesseract.image_to_string(blur)
try:
return re.findall(r'\d+\s?\/\s?(\d+)', txt)[0]
except:
return "REVIEW"
for filename in os.listdir(directory):
if filename.endswith(".png"):
sPath = os.path.join(directory, filename)
useMagick(sPath)
x = readNumber(sTemp)
try:
print(x)
try:
os.rename(sPath, os.path.join(os.getcwd(), directory, x + '.png'))
except:
n+=1
os.rename(sPath, os.path.join(os.getcwd(), directory, x + '_' + str(n) + '.png'))
continue
except:
print(sPath)
else:
continue
Both codes are working very well but separately. When trying to combine both of them, I didn’t get the desired output and the files are not renamed. Here’s my final code that didn’t work well
from pathlib import Path
import shutil
import fitz
from PIL import Image
import cv2, os, pytesseract
import time
def crop(image_path, coords, saved_location):
image_obj = Image.open(image_path)
cropped_image = image_obj.crop(coords)
cropped_image.save(saved_location)
def all_white_pixels(img):
H, W = img.shape[:2]
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
pixels = cv2.countNonZero(thresh)
return True if pixels == (H * W) else False
BASE_DIR = Path.cwd()
OUTPUT_DIR = BASE_DIR / "Output"
STICKERS_DIR = BASE_DIR / "Stickers"
try:
shutil.rmtree(OUTPUT_DIR)
except OSError as e:
pass
try:
shutil.rmtree(STICKERS_DIR)
except:
pass
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
STICKERS_DIR.mkdir(parents=True, exist_ok=True)
doc = fitz.open("Sample.pdf")
for page in doc:
pix = page.get_pixmap()
img = "Output/Page-%i.png" % page.number
pix.save(img)
s1 = 'Stickers/Page-%i-Cropped1.png' % page.number
s2 = 'Stickers/Page-%i-Cropped2.png' % page.number
s3 = 'Stickers/Page-%i-Cropped3.png' % page.number
s4 = 'Stickers/Page-%i-Cropped4.png' % page.number
crop(img, (34, 6, 588, 192), s1)
crop(img, (34, 192, 588, 373), s2)
crop(img, (34, 373, 588, 550), s3)
crop(img, (34, 550, 588, 730), s4)
img1 = cv2.imread(s1)
if all_white_pixels(img1):
os.remove(s1)
img2 = cv2.imread(s2)
if all_white_pixels(img2):
os.remove(s2)
img3 = cv2.imread(s3)
if all_white_pixels(img3):
os.remove(s3)
img4 = cv2.imread(s4)
if all_white_pixels(img4):
os.remove(s4)
time.sleep(5)
sTemp = "Temp.png"
directory = '.\Stickers'
def useMagick(img):
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
command = 'magick convert {} -resize 2048x640 -density 200 -quality 100 {}'.format(img, sTemp)
os.system(command)
def readNumber(img):
img = cv2.imread(img)
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
txt = pytesseract.image_to_string(gry)
try:
return re.findall(r'\d+\s?\/\s?(\d+)', txt)[0]
except:
blur = cv2.GaussianBlur(gry, (3,3), 0)
txt = pytesseract.image_to_string(blur)
try:
return re.findall(r'\d+\s?\/\s?(\d+)', txt)[0]
except:
return "REVIEW"
for filename in os.listdir(directory):
if filename.endswith(".png"):
sPath = os.path.join(directory, filename)
useMagick(sPath)
x = readNumber(sTemp)
try:
print(x)
try:
os.rename(sPath, os.path.join(os.getcwd(), directory, x + '.png'))
except:
n+=1
os.rename(sPath, os.path.join(os.getcwd(), directory, x + '_' + str(n) + '.png'))
continue
except:
print(sPath)
else:
continue
The code is working as for the first (the pdf is split into images and the images are split into chops) but it doesn’t work as for renaming the images according to the numbers extracted by ocr Here’s a sample pdf file to test the codes on it https://www.mediafire.com/file/cy6xwjkhc1td2zg/Sample.pdf/file
Code:
And I have another python code that reads the numbers on the stickers then rename those images according to the number
Code:
Code: