-
Notifications
You must be signed in to change notification settings - Fork 0
/
PDFtoTXT.py
166 lines (147 loc) · 5.7 KB
/
PDFtoTXT.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#! ./pdf_to_txt/
#This is only functional and I know it is POORLY written. I have no time and I'm doing this just out of need.
import os
import shutil
import subprocess

import click
import deep_translator
import inquirer as inq
import progressbar
from natsort import natsorted # because python string sorts is kinda bad tbh
from pdf2image import convert_from_path
def usrInput():
    """Prompt the user for the run settings and store them on this function.

    After the call the following attributes are set:

    * ``usrInput.title`` -- the PDF name typed by the user, or ``"NONE"``
      when the answer was empty.
    * ``usrInput.output_ocr_file`` -- the output name with ``.txt`` appended
      (``output_ocr_file.txt`` when the answer was empty).
    * ``usrInput.deleteCache`` -- whether the working folders are removed.
    * ``usrInput.translate`` -- whether the OCR text should be translated.
    """
    pdf_name = input("Select the name of the file here\n")
    usrInput.title = str(pdf_name if pdf_name else "NONE")

    output_stem = input("Select your .txt output name (without extension) \n")
    usrInput.output_ocr_file = str(output_stem if output_stem else "output_ocr_file") + ".txt"

    # Deleting the folders is the default answer; translating is not.
    usrInput.deleteCache = bool(click.confirm('Delete folders?', default=True))
    usrInput.translate = bool(click.confirm(" Translate?", default=False))
def PDFtoPNG(): # convert pdf into multiple png's
    """Render every page of the selected PDF to ``./.input/page_<i>.png``.

    The PDF to read is ``usrInput.title``. When that still holds the
    ``"NONE"`` placeholder, the first ``.pdf`` file of the current directory
    (in sorted name order) is selected and written back to ``usrInput.title``.

    Raises:
        FileNotFoundError: if no title was given and the current directory
            contains no ``.pdf`` file.
    """
    # auto selection of pdf -- only needed when the user did not name one
    if usrInput.title == "NONE":
        pdf_files = sorted(name for name in os.listdir('./') if name.endswith(".pdf"))
        if not pdf_files:
            raise FileNotFoundError("No .pdf file found in the current directory")
        # Sorted so the pick does not depend on the arbitrary os.listdir order.
        usrInput.title = pdf_files[0]

    pbar = progressbar.ProgressBar(widgets=['Reading... ', progressbar.AnimatedMarker()]).start()
    try:
        os.mkdir('./.input')
    except FileExistsError:
        # The folder is already there; reuse it.
        pass

    # The second positional argument of convert_from_path is the render DPI.
    images = convert_from_path(f'{usrInput.title}', 200)
    for i, image in enumerate(images):
        pbar.update(i)
        image.save(f'./.input/page_{i}.png')
    pbar.finish()
#this loop selects only the desired format type
def fileSelector():
    """Collect the names of the PNG files found in ``./.input``.

    The bare file names (no directory prefix) are stored on the function
    itself as ``fileSelector.listfiles``, in the order ``os.listdir`` yields
    them.
    """
    wanted_suffix = "." + "png"
    fileSelector.listfiles = []
    fileSelector.listfiles.extend(
        entry for entry in os.listdir('./.input') if entry.endswith(wanted_suffix)
    )
def genOutputfolder():
    """Make sure the hidden ``./.output`` folder exists.

    A folder that is already present is left untouched.
    """
    ocr_dir = './.output'
    try:
        os.mkdir(ocr_dir)
    except FileExistsError:
        # Already present; nothing to do.
        return
# Tesseract main function
def ocrMain(lang="fra"):
    """Run the ``tesseract`` executable on every image in ``fileSelector.listfiles``.

    Each ``./.input/<name>`` is handed to tesseract with ``./.output/<name>``
    as the output base. Tesseract's own console output is discarded. A page
    whose run exits with a non-zero status is reported and skipped, and the
    remaining pages are still processed.

    Args:
        lang: Value passed to tesseract's ``-l`` option. Defaults to
            ``"fra"``, the value that used to be hard-coded.

    Raises:
        FileNotFoundError: if the ``tesseract`` executable cannot be started.
    """
    pages = fileSelector.listfiles
    pbar = progressbar.ProgressBar(widgets=['Writing...',progressbar.SimpleProgress(),progressbar.Percentage(), progressbar.Bar(),
                                            ' ', progressbar.ETA()], maxval=len(pages)).start()
    for done, element in enumerate(pages, start=1):
        # An argument list (no shell) passes file names verbatim, so spaces
        # or shell metacharacters in a name cannot break or alter the command.
        result = subprocess.run(
            ['tesseract', '-l', lang, './.input/' + element, './.output/' + element],
            stdout=subprocess.DEVNULL,  # same effect as the old ">/dev/null 2>&1"
            stderr=subprocess.DEVNULL,
        )
        if result.returncode != 0:
            print(f"tesseract failed on {element} (exit code {result.returncode}). Ignoring")
        # Update after the page is done so the counter ends on the total.
        pbar.update(done)
    pbar.finish()
#translate from google
#there's a 5,000 character limit on google translator :(
def genOutputTrans():
    """Make sure the hidden ``./.output_translated`` folder exists.

    A folder that is already present is left untouched.
    """
    translated_dir = './.output_translated'
    try:
        os.mkdir(translated_dir)
    except FileExistsError:
        # Already present; nothing to do.
        return
def translateOpt():
    """Ask the user which language to translate into.

    Whatever ``inq.prompt`` returns is stored unchanged on
    ``translateOpt.answers``; the translation step reads the chosen language
    from its ``'lang'`` key.
    """
    language_menu = inq.List(
        'lang',
        message="Select which language you want to use",
        choices=['spanish', 'english', 'french','italian', 'portuguese', 'german'],
    )
    translateOpt.answers = inq.prompt([language_menu])
def translatefromGoogle():
    """Translate every ``.txt`` page in ``./.output`` into ``./.output_translated``.

    Pages are processed in natural sort order and written as
    ``./.output_translated/<index>.txt``, where ``<index>`` is the 0-based
    position in that order. The target language is
    ``translateOpt.answers['lang']``. A page rejected by the translator with
    ``NotValidLength`` is reported and written out as an empty file, so the
    page numbering stays aligned.
    """
    listtxt_trans = natsorted(
        ['./.output/' + name for name in os.listdir('./.output') if name.endswith('.txt')]
    )
    # Size the bar from the files actually translated; sizing it from the PNG
    # list could make update() exceed the maximum when the two counts differ.
    pbar = progressbar.ProgressBar(widgets=['Translating...',progressbar.SimpleProgress(),progressbar.Percentage(), progressbar.Bar(),
                                            ' ', progressbar.ETA()], maxval=len(listtxt_trans)).start()
    translator = deep_translator.GoogleTranslator(source='auto', target=translateOpt.answers['lang'])
    for i, file in enumerate(listtxt_trans):
        try:
            translated = translator.translate_file(file)
        except deep_translator.exceptions.NotValidLength: #if it's over 5.000, ignore that file and print an error
            translated = ""
            print("Page #", i ," is over 5.000 characters. Ignoring")
        # The context manager closes the file even if write() fails; UTF-8 is
        # set explicitly so the result does not depend on the locale default.
        with open(f'./.output_translated/{i}.txt', 'w', encoding='utf-8') as output_translated:
            output_translated.write(translated)
        pbar.update(i + 1)
    pbar.finish()
#merge all into one txt (Untranslated)
def mergeALLtxt():
    """Concatenate the ``.txt`` pages of ``./.output`` into the output file.

    Pages are appended in natural sort order, byte for byte, to the file
    named by ``usrInput.output_ocr_file`` (which is overwritten).
    """
    page_paths = [
        './.output/' + name
        for name in os.listdir('./.output')
        if name.endswith('.txt')
    ]
    with open(usrInput.output_ocr_file, 'wb') as merged:
        # natsorted keeps page_10 after page_9 instead of after page_1
        for page_path in natsorted(page_paths):
            with open(page_path, 'rb') as page:
                shutil.copyfileobj(page, merged)
#merge all into one txt (Translated)
def mergeALLtxtTranslated():
    """Concatenate the ``.txt`` pages of ``./.output_translated`` into the output file.

    Pages are appended in natural sort order, byte for byte, to the file
    named by ``usrInput.output_ocr_file`` (which is overwritten).
    """
    page_paths = [
        './.output_translated/' + name
        for name in os.listdir('./.output_translated')
        if name.endswith('.txt')
    ]
    with open(usrInput.output_ocr_file, 'wb') as merged:
        # natsorted keeps 10.txt after 9.txt instead of after 1.txt
        for page_path in natsorted(page_paths):
            with open(page_path, 'rb') as page:
                shutil.copyfileobj(page, merged)
def rmFolder(folder_name):
    """Recursively delete ``folder_name`` if the user asked for cleanup.

    Nothing happens unless ``usrInput.deleteCache`` is true. A folder that is
    missing or cannot be removed for lack of permission is reported on
    standard output instead of aborting the program.

    Args:
        folder_name: Path of the folder to remove.
    """
    if not usrInput.deleteCache:
        return
    try:
        shutil.rmtree(folder_name)
    except (FileNotFoundError, PermissionError):
        # PermissionError is caught too: the message below already promises
        # to cover the "protected" case, which used to crash instead.
        print("Failed to delete", folder_name ,"Is already deleted or protected?")
def main():
    """Run the whole pipeline.

    Order of operations: ask for the settings (and the target language when
    translation was requested), render the PDF to PNG pages, OCR them,
    optionally translate, merge the pages into the output file, and finally
    remove the working folders.
    """
    usrInput()
    wants_translation = usrInput.translate
    if wants_translation:
        translateOpt()

    PDFtoPNG()
    fileSelector()
    genOutputfolder()
    ocrMain() #This ends the default program and asks the user for translation

    if not wants_translation:
        mergeALLtxt()
    else:
        genOutputTrans()
        translatefromGoogle()
        mergeALLtxtTranslated()

    # Cleanup; rmFolder itself checks whether the user wanted the folders removed.
    leftovers = ['./.output']
    if wants_translation:
        leftovers.append('./.output_translated')
    leftovers.append('./.input')
    for folder in leftovers:
        rmFolder(folder)
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()