119 lines
3.3 KiB
Python
119 lines
3.3 KiB
Python
|
|
import codecs, os,regex, subprocess
|
|
|
|
|
|
def html_to_markdown(infile, out):
    """Convert an HTML file to Markdown by invoking the ``pandoc`` executable.

    Args:
        infile: Path of the HTML input, relative to the current directory
            (it is passed to pandoc as ``./<infile>``).
        out: Path of the Markdown output, relative to the current directory
            (it is passed to pandoc as ``./<out>``).

    Returns:
        None. Success or failure is reported on stdout; failures do not raise.
    """
    # Human-readable form of the command, used for logging only.
    cmd = f"pandoc -o \"./{out}\" -f html -t markdown \"./{infile}\""
    print(cmd)

    # Run pandoc with an argument list and no shell, so that quotes, "$(...)"
    # or backticks in a file name are never interpreted as shell syntax.
    argv = ["pandoc", "-o", f"./{out}", "-f", "html", "-t", "markdown", f"./{infile}"]
    try:
        result = subprocess.run(argv, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as exc:
        # Without a shell, a missing or non-executable pandoc surfaces as an
        # exception rather than a non-zero exit status; report it the same way.
        print(f"Error occurred: {exc}")
        return

    if result.returncode != 0:
        # errors='replace' keeps the report from failing on non-UTF-8 stderr.
        print(f"Error occurred: {result.stderr.decode('utf-8', errors='replace')}")
    else:
        print(f"Successfully converted '{infile}' to '{out}'")
|
|
|
|
|
|
def pdf_to_html(infile, out):
    """Extract the text of every page of a PDF and write it to ``out``.

    Despite the name, no HTML markup is produced: the output file contains
    the concatenated text returned by PyPDF2 for each page, encoded as UTF-8.

    Args:
        infile: Path of the PDF file to read.
        out: Path of the text file to create or overwrite.
    """
    import PyPDF2

    # Context managers guarantee both handles are closed even if extraction
    # or writing raises part-way through.
    with open(infile, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Join once instead of growing a string page by page; `or ''` guards
        # against a page yielding a falsy (e.g. None) extraction result.
        text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)

    with codecs.open(out, 'w', 'utf-8') as ofile:
        ofile.write(text)
|
|
|
|
|
|
def pdf_to_html2(infile, out):
    """Write each text container of a PDF to ``out`` as an HTML paragraph.

    Uses pdfminer's layout analysis: every ``LTTextContainer`` found on a
    page has its text HTML-escaped and is written as ``<p>...</p>``. No
    surrounding document structure (``<html>``, ``<body>``) is emitted.

    Args:
        infile: Path of the PDF file to read.
        out: Path of the UTF-8 output file to create or overwrite.
    """
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer
    import html

    # The context manager ensures the output is flushed and closed, including
    # when pdfminer raises while parsing.
    with codecs.open(out, 'w', 'utf-8') as ofile:
        print(infile)
        for page_layout in extract_pages(infile):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    # Escape &, <, > and quotes so the text cannot break the markup.
                    text = html.escape(element.get_text())
                    ofile.write(f"<p>{text}</p>")
|
|
|
|
|
|
|
|
def convert(filename=""):
    """Convert PDFs in ``cache/docs`` to Markdown via an intermediate file.

    For each PDF, ``pdf_to_html`` writes ``<stem>.html`` next to it and
    ``html_to_markdown`` then turns that file into ``<stem>.md``.

    Args:
        filename: Name of a single file inside ``cache/docs`` to convert.
            When empty (the default), every entry of the directory whose
            name ends in ``.pdf`` is converted.
    """
    target_dir = 'cache/docs'

    if filename:
        pdf_names = [filename]
    else:
        # The directory only needs to be listed in batch mode.
        pdf_names = [name for name in os.listdir(target_dir) if name.endswith(".pdf")]

    for pdf_name in pdf_names:
        # splitext removes only the final extension, so "a.v1.pdf" and
        # "a.v2.pdf" get distinct outputs instead of both collapsing to "a".
        stem = os.path.splitext(pdf_name)[0]
        html_path = f"{target_dir}/{stem}.html"
        pdf_to_html(f"{target_dir}/{pdf_name}", html_path)
        html_to_markdown(html_path, f"{target_dir}/{stem}.md")
|
|
|
|
|
|
def clean(fn):
    """Normalise invisible Unicode characters in a UTF-8 text file, in place.

    Non-breaking spaces (U+00A0) become regular spaces; soft hyphens
    (U+00AD) and zero-width spaces (U+200B) are removed.

    Args:
        fn: Path of the file to rewrite.
    """
    # One translation table covers all three substitutions in a single pass.
    substitutions = str.maketrans({'\u00A0': ' ', '\u00AD': None, '\u200B': None})

    with open(fn, 'r', encoding='utf-8') as source:
        cleaned = source.read().translate(substitutions)

    with open(fn, 'w', encoding='utf-8') as target:
        target.write(cleaned)
|
|
|
|
|
|
def fix_line_breaks(fn):
    """Re-flow a UTF-8 text file in place so each paragraph is one line.

    Consecutive non-blank lines are stripped and joined with single spaces.
    Every blank line closes the current paragraph, so a blank line that
    follows another blank line (or starts the file) yields an empty output
    line. Output lines are joined with ``\\n`` and no trailing newline is
    written.

    Args:
        fn: Path of the file to rewrite.
    """
    with codecs.open(fn, 'r', 'utf-8') as file:
        lines = file.readlines()

    new_lines = []
    pieces = []  # stripped lines of the paragraph currently being collected

    for line in lines:
        stripped = line.strip()
        if stripped == '':
            # A blank line ends the paragraph (possibly an empty one).
            new_lines.append(' '.join(pieces))
            pieces = []
        else:
            pieces.append(stripped)

    # Flush a final paragraph that was not terminated by a blank line.
    if pieces:
        new_lines.append(' '.join(pieces))

    # The context manager guarantees the rewritten file is flushed and closed.
    with codecs.open(fn, 'w', 'utf-8') as fout:
        fout.write('\n'.join(new_lines))
|
|
|
|
|
|
# Script driver (runs on import): convert cache/docs/hyflex.pdf to Markdown,
# then post-process the generated Markdown file in place.
fix_file = 'hyflex.md'
convert('hyflex.pdf')

# Order matters: strip invisible characters first, then re-flow paragraphs.
for _postprocess in (clean, fix_line_breaks):
    _postprocess(f'cache/docs/{fix_file}')
|
|
|
|
|
|
|