"""Convert PDFs under ``cache/docs`` to Markdown and tidy the result.

Pipeline: PDF -> extracted text (written to an ``.html`` file) -> pandoc ->
Markdown, followed by invisible-character clean-up and paragraph re-flowing.
"""
import codecs
import os
import subprocess

try:
    # Third-party and unused by the code in this file; kept importable for
    # compatibility, but no longer fatal when the package is not installed.
    import regex  # noqa: F401
except ImportError:
    regex = None


def html_to_markdown(infile, out):
    """Convert the HTML file *infile* to Markdown at *out* by running pandoc.

    Both paths are handed to pandoc prefixed with ``./``, so they are
    interpreted relative to the current working directory. The outcome is
    reported on stdout; the function returns ``None`` and does not raise when
    pandoc exits with an error or cannot be started.
    """
    # Echo the command in the same shell-like form as before (logging only).
    print(f"pandoc -o \"./{out}\" -f html -t markdown \"./{infile}\"")

    # Argument list without a shell: file names containing quotes, spaces or
    # shell metacharacters can no longer break or inject into the command.
    cmd = [
        "pandoc",
        "-o", f"./{out}",
        "-f", "html",
        "-t", "markdown",
        f"./{infile}",
    ]
    try:
        result = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
        )
    except OSError as exc:
        # Without a shell, a missing/unrunnable pandoc raises instead of
        # yielding a non-zero exit status; report it the same way.
        print(f"Error occurred: {exc}")
        return

    if result.returncode != 0:
        # errors="replace": never let undecodable stderr bytes mask the
        # actual failure with a UnicodeDecodeError.
        message = result.stderr.decode('utf-8', errors='replace')
        print(f"Error occurred: {message}")
    else:
        print(f"Successfully converted '{infile}' to '{out}'")


def pdf_to_html(infile, out):
    """Extract the text of every page of the PDF *infile* and write it to *out*.

    Despite the name, the output is the raw extracted text encoded as UTF-8;
    no HTML markup is added.
    """
    import PyPDF2  # local import, as in the original: only needed here

    with open(infile, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ''`` guards against a page for which no text is returned.
        text = ''.join(page.extract_text() or '' for page in reader.pages)

    with codecs.open(out, 'w', 'utf-8') as ofile:
        ofile.write(text)


def pdf_to_html2(infile, out):
    """Write each text container of the PDF *infile* to *out* as a paragraph.

    Uses pdfminer's layout analysis; the text of every ``LTTextContainer``
    is HTML-escaped and wrapped in ``<p>`` tags. Prints *infile* before
    processing.
    """
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer
    import html

    # The context manager guarantees the output is flushed and closed, even
    # if extraction fails part-way through.
    with codecs.open(out, 'w', 'utf-8') as ofile:
        print(infile)
        for page_layout in extract_pages(infile):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    # Sanitize the text for HTML.
                    text = html.escape(element.get_text())
                    # Wraps in HTML paragraph tags.
                    ofile.write(f"<p>{text}</p>")


def _convert_one(target_dir, pdf_name):
    """Run the PDF -> .html -> .md pipeline for one file inside *target_dir*."""
    # splitext keeps dots inside the base name ("report.v2.pdf" ->
    # "report.v2"); splitting on the first dot would truncate it to "report".
    stem = os.path.splitext(pdf_name)[0]
    html_path = f"{target_dir}/{stem}.html"
    pdf_to_html(f"{target_dir}/{pdf_name}", html_path)
    html_to_markdown(html_path, f"{target_dir}/{stem}.md")


def convert(filename="", target_dir='cache/docs'):
    """Convert PDF(s) in *target_dir* to Markdown files next to them.

    Args:
        filename: Name of a single file inside *target_dir* to convert. When
            empty, every directory entry ending in ``.pdf`` is converted.
        target_dir: Directory holding the PDFs; intermediate ``.html`` and
            final ``.md`` files are written there as well.
    """
    if filename:
        _convert_one(target_dir, filename)
        return

    # The directory is only listed when it is actually needed.
    for entry in os.listdir(target_dir):
        if entry.endswith(".pdf"):
            _convert_one(target_dir, entry)


# Invisible characters handled by clean(): non-breaking space becomes a
# regular space; soft hyphen and zero-width space are removed.
_CLEAN_TABLE = {0x00A0: ' ', 0x00AD: None, 0x200B: None}


def clean(fn):
    """Rewrite the UTF-8 text file *fn* in place without invisible characters.

    Non-breaking spaces (U+00A0) become regular spaces; soft hyphens (U+00AD)
    and zero-width spaces (U+200B) are dropped.
    """
    with open(fn, 'r', encoding='utf-8') as myfile:
        data = myfile.read()

    # One pass over the text instead of three chained replace() calls.
    data = data.translate(_CLEAN_TABLE)

    with open(fn, 'w', encoding='utf-8') as myfile:
        myfile.write(data)


def fix_line_breaks(fn):
    """Re-flow the UTF-8 text file *fn* in place, one paragraph per line.

    Consecutive non-blank lines are stripped and joined with single spaces.
    Every blank line ends the current paragraph, so repeated or leading blank
    lines produce empty output lines. Paragraphs are joined with ``\\n`` and
    no trailing newline is written.
    """
    with codecs.open(fn, 'r', 'utf-8') as file:
        lines = file.readlines()

    paragraphs = []
    parts = []  # stripped lines of the paragraph currently being collected
    for line in lines:
        stripped = line.strip()
        if stripped:
            parts.append(stripped)
        else:
            # A blank line is the end of a paragraph.
            paragraphs.append(' '.join(parts))
            parts = []
    # Handle the last paragraph (file did not end with a blank line).
    if parts:
        paragraphs.append(' '.join(parts))

    # The context manager closes (and thereby flushes) the rewritten file.
    with codecs.open(fn, 'w', 'utf-8') as fout:
        fout.write('\n'.join(paragraphs))


fix_file = 'hyflex.md'


def main():
    """Convert ``hyflex.pdf`` and post-process the resulting Markdown."""
    convert('hyflex.pdf')
    clean(f'cache/docs/{fix_file}')
    fix_line_breaks(f'cache/docs/{fix_file}')


# Guarded so that importing this module does not start a conversion.
if __name__ == "__main__":
    main()