# canvasapp/checker.py
# (paste metadata: 226 lines, 7.4 KiB, Python)

# Common functions for checking web and canvas for accessibility
import os, sys, glob, codecs
import subprocess, re, pdb, html
from bs4 import BeautifulSoup, Comment
import html.entities
from datetime import datetime
import pdb
#from html.parser import HTMLParseError
# the following from: https://chase-seibert.github.io/blog/2011/01/28/sanitize-html-with-beautiful-soup.html#
# hasnt been tested yet
def safe_html(html):
    """Sanitize an HTML fragment for safe display.

    Blacklisted tags are removed with their contents, whitelisted tags are
    kept with only whitelisted attributes, table markup is unwrapped (text
    kept, tags dropped), and everything else is unwrapped.  HTML comments
    are stripped because scripts can be executed from comments in some
    cases.  Returns None for empty input or the sentinel string ", -".
    """
    if not html:
        return None
    # remove these tags, complete with contents.
    blacklist = ["script", "style"]
    whitelist = [
        "div", "span", "p", "br", "pre", "a",
        "blockquote",
        "ul", "li", "ol",
        "b", "em", "i", "strong", "u", "iframe", "img",
        "h1", "h2", "h3", "h4", "h5", "h6",
    ]
    # Table markup is unwrapped below: the tags go away but their text survives.
    removelist = ['table', 'tbody', 'thead', 'th', 'tr', 'td']
    # BeautifulSoup is catching out-of-order and unclosed tags, so markup
    # can't leak out of comments and break the rest of the page.
    soup = BeautifulSoup(html, 'lxml')
    # now strip HTML we don't like.
    for tag in soup.findAll():
        name = tag.name.lower()
        if name in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
        elif name in whitelist:
            # Tag is allowed; make sure all of its attributes are allowed.
            # BUGFIX: iframe and img used to skip this filter entirely,
            # which let event-handler attributes (onerror, onload, style,
            # ...) survive sanitation.  They are whitelisted tags, so their
            # attributes are now filtered like any other allowed tag's.
            for attr in list(tag.attrs):
                if not _attr_name_whitelisted(attr):
                    tag.attrs.pop(attr)
        elif name in removelist:
            tag.unwrap()
        else:
            # Not a whitelisted tag: keep its children, drop the tag itself.
            tag.unwrap()
    # scripts can be executed from comments in some cases
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    result = str(soup)
    # NOTE(review): ", -" looks like a known-garbage artifact from the
    # upstream export — treated the same as empty input.
    if result == ", -":
        return None
    return result
def _attr_name_whitelisted(attr_name):
return attr_name.lower() in ["href", "src","width","height","alt","target","title","class","id"]
def safe_css(attr, css):
    """Strip width/height declarations from inline "style" values.

    Any other attribute's value passes through untouched.
    """
    if attr != "style":
        return css
    return re.sub("(width|height):[^;]+;", "", css)
def plaintext(input):
    """Converts HTML to plaintext, preserving whitespace.

    The fragment is first sanitized with safe_html(), then every text node
    is concatenated and HTML character/entity references are decoded.
    """
    # from http://effbot.org/zone/re-sub.htm#unescape-html
    def _unescape(text):
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # numeric character reference: &#65; or &#x41;
                try:
                    if text[:3] == "&#x":
                        return chr(int(text[3:-1], 16))
                    else:
                        return chr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity: &amp; etc.
                try:
                    text = chr(html.entities.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text  # leave unrecognized references as-is
        # BUGFIX: raw string — "\w" in a plain literal is an invalid escape
        # sequence (SyntaxWarning/DeprecationWarning on modern Python).
        return re.sub(r"&#?\w+;", fixup, text)
    input = safe_html(input)  # basic sanitation first
    # BUGFIX: name the parser explicitly ('lxml', matching safe_html).
    # Omitting it emits a GuessedAtParserWarning and can select different
    # parsers — and thus different output — on different machines.
    text = "".join(BeautifulSoup("<body>%s</body>" % input, 'lxml').body(text=True))
    text = text.replace("xml version='1.0' encoding='%SOUP-ENCODING%'", "")  # strip BS meta-data
    return _unescape(text)
#os.system("node node_modules/pa11y/bin/pa11y.js --standard Section508 http://www.gavilan.edu/student/online")
def check_folder(fname, path):
    """Run pa11y (Section 508 standard) over every file in path+fname.

    Returns (error_count, html_report).  A missing folder or any other
    error ends the scan early and whatever was accumulated so far is
    returned, so one bad course folder doesn't kill the whole run.
    """
    report = '<h2>' + fname + '</h2>\n'
    count = 0
    try:
        for F in os.listdir(path + fname):
            target = path + fname + "/" + F
            # BUGFIX: pass an argv list with the default shell=False so file
            # names containing spaces or shell metacharacters can neither
            # break nor inject into the command line (the old version built
            # one unquoted string and ran it with shell=True).
            cmd = ["/usr/bin/node",
                   "/home/phowell/Documents/access/node_modules/pa11y/bin/pa11y.js",
                   "--standard", "Section508",
                   target]
            print(("" + target))
            output = subprocess.run(cmd, stdout=subprocess.PIPE,
                                    universal_newlines=True, check=False)
            report += "<h3>" + F + "</h3>\n"
            # pa11y's summary line is the third line from the end of stdout.
            line = output.stdout.split('\n')[-3]
            if re.search(r'No\sissues', line):
                pass  # clean file — nothing to report
            else:
                m = re.search(r'(\d+)\sErr', line)
                if m:
                    count += int(m.group(1))
                # Skip pa11y's banner (first 4 lines), escape the rest into
                # the HTML report.
                lines = output.stdout.split("\n")[4:]
                report += "<pre>" + html.escape("\n".join(lines)) + "</pre>\n\n\n"
    except Exception as e:
        print('finished with error or folder missing')
        print(e)
    return int(count), report
def check_class(folder):
    """Check one course's assignments and pages folders.

    Returns (total_error_count, html_report) for the course.
    """
    path = "/home/phowell/hdd/SCRIPTS/everything-json/course_temps/" + folder + "/"
    header = "<h1>Report on course: " + folder + "</h1>\n\n"
    assignments_count, assignments_report = check_folder('assignments', path)
    pages_count, pages_report = check_folder('pages', path)
    total = assignments_count + pages_count
    return total, header + assignments_report + pages_report
def check_all():
    """Scan every course folder under course_temps and write two files:
    report.html (full pa11y output per course) and summary.html (one line
    per course with its error count, linking into the full report).
    """
    hd_path = '/home/phowell/hdd/SCRIPTS/everything-json/'
    # BUGFIX: the two report handles were opened and never closed; the
    # context manager guarantees they are flushed and closed even when a
    # course scan raises.
    with open(hd_path + 'report.html', 'w') as rep_f, \
         open(hd_path + 'summary.html', 'w') as rep_s:
        rep_f.write('<meta charset="utf-8"/>\n')
        listt = os.listdir('/home/phowell/hdd/SCRIPTS/everything-json/course_temps')
        #listt = ['course_4341',] # for testing
        for L in listt:
            print(('Directory is: ' + L))
            # The first *.txt file in the course folder carries the
            # human-readable course name (fall back to "unknown").
            m = glob.glob('/home/phowell/hdd/SCRIPTS/everything-json/course_temps/' + L + '/*.txt')
            if m:
                name = m[0]
            else:
                name = 'unknown.txt'
            name = name.split('.')[0]
            name = name.split('/')[-1]
            print(('name is: ' + name))
            (cnt, rep) = check_class(L)
            rep_f.write("<a name='" + L + "'><h1>" + name + "</h1>\n" + rep + "\n\n<br /><br />\n\n")
            # Flush after each course so the reports stay readable while the
            # long scan is still running.
            rep_f.flush()
            rep_s.write("(" + str(cnt) + ") Class: <a href='report.html#" + L + "'>" + name + "</a><br />\n")
            rep_s.flush()
# Script entry point: scan every course folder and write the reports.
if __name__ == "__main__":
    check_all()
    #print(('arguments: '+str(sys.argv)))
# test
# NOTE(review): dead manual-test snippet, parked in a module-level string
# literal so it never runs; it re-sanitized one course's pages with
# safe_html() during development.
"""
file = 'course_temps/course_6862/pages/choose-the-right-browser.html'
dir = 'course_temps/course_6862/pages/'
#ff = open(file,'r').read()
#print safe_html(ff)
for file in os.listdir(dir):
if re.search('_cleaned\.html',file):
os.remove(dir+file)
for file in os.listdir(dir):
if file.endswith(".html"):
newfname = re.sub('\.html','_cleaned.html',file)
ff = codecs.open(dir+file,'r','utf-8').read()
print(file)
print(newfname)
newf = codecs.open(dir+newfname,'w','utf-8')
newf.write(safe_html(ff))
newf.close()
"""