# canvasapp/docs.py

# 119 lines
# 3.3 KiB
# Python

import codecs, os,regex, subprocess
def html_to_markdown(infile, out):
    """Convert an HTML file to Markdown by invoking the ``pandoc`` CLI.

    Both paths are prefixed with ``./`` before being handed to pandoc.
    The command line and the outcome are printed to stdout; nothing is
    returned, and a failed conversion is reported rather than raised.

    Args:
        infile: Path of the HTML file to read.
        out: Path of the Markdown file to write.
    """
    import shlex

    # Pass an argument list (no shell) so that quotes, spaces or shell
    # metacharacters in a file name cannot change the command being run.
    cmd = ["pandoc", "-o", f"./{out}", "-f", "html", "-t", "markdown", f"./{infile}"]
    print(" ".join(shlex.quote(arg) for arg in cmd))
    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as exc:
        # Without a shell, a missing or non-executable pandoc surfaces as an
        # exception instead of a non-zero exit status; report it the same way.
        print(f"Error occurred: {exc}")
        return
    if result.returncode != 0:
        print(f"Error occurred: {result.stderr.decode('utf-8', errors='replace')}")
    else:
        print(f"Successfully converted '{infile}' to '{out}'")
def pdf_to_html(infile, out):
    """Extract the text of a PDF with PyPDF2 and write it to ``out`` as UTF-8.

    Despite the name, no HTML markup is produced: the output is the text
    returned by ``extract_text()`` for each page, concatenated in page order.

    Args:
        infile: Path of the PDF file to read.
        out: Path of the file that receives the extracted text.
    """
    import PyPDF2

    # Context managers guarantee both handles are closed even if parsing
    # or writing raises part-way through.
    with open(infile, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # ``or ''`` guards against a page yielding no text object at all.
        text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    with codecs.open(out, 'w', 'utf-8') as ofile:
        ofile.write(text)
def pdf_to_html2(infile, out):
    """Write the text containers of a PDF to ``out`` as HTML paragraphs.

    Uses pdfminer's ``extract_pages`` to walk every page layout; each
    ``LTTextContainer`` element is HTML-escaped and written wrapped in
    ``<p>...</p>``. Other layout elements are skipped. The input path is
    printed to stdout before processing starts.

    Args:
        infile: Path of the PDF file to read.
        out: Path of the UTF-8 HTML file to write.
    """
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer
    import html

    # The ``with`` block closes (and flushes) the output even when
    # pdfminer raises while parsing a page.
    with codecs.open(out, 'w', 'utf-8') as ofile:
        print(infile)
        for page_layout in extract_pages(infile):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    # Escape so PDF text cannot be read as HTML markup.
                    text = html.escape(element.get_text())
                    ofile.write(f"<p>{text}</p>")
def _convert_pdf(target_dir, pdf_name):
    """Run one PDF in ``target_dir`` through pdf_to_html, then html_to_markdown."""
    # splitext drops only the final extension, so "a.b.pdf" keeps the
    # stem "a.b" instead of being truncated at the first dot.
    stem = os.path.splitext(pdf_name)[0]
    html_path = f"{target_dir}/{stem}.html"
    pdf_to_html(f"{target_dir}/{pdf_name}", html_path)
    html_to_markdown(html_path, f"{target_dir}/{stem}.md")


def convert(filename=""):
    """Convert PDFs under ``cache/docs`` to ``.html`` and then ``.md`` files.

    Args:
        filename: Name of a single file inside ``cache/docs`` to convert.
            When empty (the default), every directory entry whose name
            ends with ``.pdf`` is converted.
    """
    target_dir = 'cache/docs'
    if filename:
        _convert_pdf(target_dir, filename)
        return
    # The directory is only listed in batch mode; a single-file request
    # does not need it.
    for entry in os.listdir(target_dir):
        if entry.endswith(".pdf"):
            _convert_pdf(target_dir, entry)
def clean(fn):
    """Rewrite the UTF-8 file ``fn`` in place with invisible characters normalised.

    Non-breaking spaces (U+00A0) become ordinary spaces; soft hyphens
    (U+00AD) and zero-width spaces (U+200B) are removed.

    Args:
        fn: Path of the text file to clean.
    """
    # One translation pass covers all three substitutions.
    substitutions = {0x00A0: ' ', 0x00AD: None, 0x200B: None}

    with open(fn, 'r', encoding='utf-8') as source:
        cleaned = source.read().translate(substitutions)

    with open(fn, 'w', encoding='utf-8') as target:
        target.write(cleaned)
def fix_line_breaks(fn):
    """Join hard-wrapped lines of the UTF-8 file ``fn`` into paragraphs, in place.

    Consecutive non-blank lines are stripped and joined with single spaces.
    Every blank line ends the current paragraph (appending an empty entry if
    no text has accumulated). The resulting paragraphs are written back
    separated by a single ``\\n`` with no trailing newline.

    Args:
        fn: Path of the text file to rewrite.
    """
    with codecs.open(fn, 'r', 'utf-8') as file:
        lines = file.readlines()

    new_lines = []
    paragraph = ''
    for line in lines:
        stripped = line.strip()
        if stripped == '':
            # A blank line marks the end of a paragraph.
            new_lines.append(paragraph.strip())
            paragraph = ''
        else:
            # Trailing space keeps words from adjacent lines apart.
            paragraph += stripped + ' '
    # Flush the final paragraph when the file does not end with a blank line.
    if paragraph != '':
        new_lines.append(paragraph.strip())

    # Context manager ensures the rewritten file is flushed and closed.
    with codecs.open(fn, 'w', 'utf-8') as fout:
        fout.write('\n'.join(new_lines))
# Module-level driver: convert the hyflex PDF, then post-process the
# generated Markdown file in place (character cleanup, then line joining).
fix_file = 'hyflex.md'
convert('hyflex.pdf')
_fix_path = f'cache/docs/{fix_file}'
for _postprocess in (clean, fix_line_breaks):
    _postprocess(_fix_path)