# canvasapp/docs.py


import codecs, os,regex, subprocess


def html_to_markdown(infile,out):
    """Convert an HTML file to Markdown by running the ``pandoc`` executable.

    Both paths are prefixed with ``./`` before being handed to pandoc, so they
    are resolved relative to the current working directory. The command line
    and the outcome are reported with ``print``; a failing or missing pandoc
    is reported the same way rather than raised.

    Args:
        infile: Path of the HTML file to read.
        out: Path of the Markdown file to write.
    """
    import shlex

    # Pass an argument list and no shell, so spaces, quotes or shell
    # metacharacters in a file name are never interpreted as shell syntax.
    cmd = ["pandoc", "-o", f"./{out}", "-f", "html", "-t", "markdown", f"./{infile}"]
    print(" ".join(shlex.quote(arg) for arg in cmd))

    try:
        result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError as err:
        # Without a shell, an absent or non-executable pandoc raises instead
        # of producing a non-zero exit status; report it like any failure.
        print(f"Error occurred: {err}")
        return

    if result.returncode != 0:
        # errors='replace' keeps a badly encoded diagnostic from masking the
        # real failure with a UnicodeDecodeError.
        print(f"Error occurred: {result.stderr.decode('utf-8', errors='replace')}")
    else:
        print(f"Successfully converted '{infile}' to '{out}'")


def pdf_to_html(infile,out):
    """Extract the text of every page of a PDF and write it to ``out`` as UTF-8.

    Despite the name, no HTML markup is added: the output is the text returned
    by PyPDF2's ``extract_text()`` for each page, concatenated in page order.

    Args:
        infile: Path of the PDF file to read.
        out: Path of the file to create or overwrite with the extracted text.
    """
    import PyPDF2

    # Context managers close both files even if parsing or text extraction
    # raises part-way through.
    with open(infile, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Join once instead of growing a string page by page; a page that
        # yields no text contributes an empty string.
        text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)

    with codecs.open(out, 'w', 'utf-8') as ofile:
        ofile.write(text)


def pdf_to_html2(infile,out):
    """Write each text container of a PDF to ``out`` as an HTML ``<p>`` element.

    Pages are laid out with pdfminer's ``extract_pages``; every
    ``LTTextContainer`` found on a page is HTML-escaped and wrapped in
    paragraph tags. Other layout elements are skipped. The input path is
    printed before extraction starts.

    Args:
        infile: Path of the PDF file to read.
        out: Path of the UTF-8 HTML file to create or overwrite.
    """
    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTTextContainer
    import html

    # The context manager guarantees the output is flushed and closed, even
    # if pdfminer raises while walking the document.
    with codecs.open(out,'w','utf-8') as ofile:
        print(infile)
        for page_layout in extract_pages(infile):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    text = html.escape(element.get_text())  # sanitize the text for HTML
                    ofile.write(f"<p>{text}</p>")  # wraps in HTML paragraph tags


def _convert_pdf(target_dir, pdf_name):
    """Turn one PDF inside ``target_dir`` into sibling ``.html`` and ``.md`` files."""
    # splitext removes only the final extension, so "a.b.pdf" becomes
    # "a.b.html" / "a.b.md" instead of being truncated to "a".
    stem = os.path.splitext(pdf_name)[0]
    html_path = target_dir + "/" + stem + ".html"

    pdf_to_html(target_dir + "/" + pdf_name, html_path)
    html_to_markdown(html_path, target_dir + "/" + stem + ".md")


def convert(filename=""):
    """Convert PDFs in ``cache/docs`` to Markdown via an intermediate file.

    Each PDF is first passed to ``pdf_to_html`` and the result is then passed
    to ``html_to_markdown``; both outputs are written next to the PDF, using
    its name with the extension replaced by ``.html`` and ``.md``.

    Args:
        filename: Name of a single file inside ``cache/docs`` to convert.
            When empty (the default), every entry in the directory whose name
            ends with ``.pdf`` is converted, in sorted order.
    """
    target_dir = 'cache/docs'

    if filename:
        _convert_pdf(target_dir, filename)
        return

    # The listing is only needed in batch mode; sorting makes the processing
    # order deterministic across platforms.
    for entry in sorted(os.listdir(target_dir)):
        if entry.endswith(".pdf"):
            _convert_pdf(target_dir, entry)


def clean(fn):
    """Rewrite a UTF-8 text file in place without invisible formatting characters.

    Non-breaking spaces (U+00A0) are turned into ordinary spaces, while soft
    hyphens (U+00AD) and zero-width spaces (U+200B) are dropped.

    Args:
        fn: Path of the file to read and then overwrite.
    """
    # One translation table covers all three substitutions in a single pass;
    # a value of None deletes the character.
    substitutions = str.maketrans({
        '\u00A0': ' ',
        '\u00AD': None,
        '\u200B': None,
    })

    with open(fn, 'r', encoding='utf-8') as source:
        cleaned = source.read().translate(substitutions)

    with open(fn, 'w', encoding='utf-8') as target:
        target.write(cleaned)


def fix_line_breaks(fn):
    """Re-flow a UTF-8 text file in place so each paragraph sits on one line.

    Runs of non-blank lines are stripped and joined with single spaces. Every
    blank line ends the current paragraph, so a blank line that follows
    another blank line (or starts the file) produces an empty output line.
    Output lines are joined with ``\\n`` and no trailing newline is written.

    Args:
        fn: Path of the file to read and then overwrite.
    """
    # codecs.open is kept on purpose: its readlines() also splits on form
    # feeds and other Unicode line boundaries, which built-in open() does not.
    with codecs.open(fn, 'r', 'utf-8') as file:
        lines = file.readlines()

    new_lines = []
    pieces = []  # stripped, non-blank lines of the paragraph being collected

    for line in lines:
        stripped = line.strip()
        if stripped:
            pieces.append(stripped)
        else:
            # A blank line closes the paragraph (possibly an empty one).
            new_lines.append(' '.join(pieces))
            pieces = []

    # Flush a final paragraph that was not followed by a blank line.
    if pieces:
        new_lines.append(' '.join(pieces))

    # The context manager ensures the rewritten text is flushed and the
    # handle closed; the original left the file open.
    with codecs.open(fn, 'w', 'utf-8') as fout:
        fout.write('\n'.join(new_lines))


# Script driver: convert hyflex.pdf inside cache/docs, then tidy the Markdown
# file that the conversion wrote.
fix_file = 'hyflex.md'
convert('hyflex.pdf')

# Both passes rewrite the same file in place: character clean-up first,
# paragraph re-flow second.
for _postprocess in (clean, fix_line_breaks):
    _postprocess(f'cache/docs/{fix_file}')