# canvasapp/checker.py
# (paste metadata: 226 lines, 7.4 KiB, Python)

# Common functions for checking web and canvas for accessibility
import os, sys, glob, codecs
import subprocess, re, pdb, html
from bs4 import BeautifulSoup, Comment
import html.entities
from datetime import datetime
import pdb
#from html.parser import HTMLParseError
# the following from: https://chase-seibert.github.io/blog/2011/01/28/sanitize-html-with-beautiful-soup.html#
# hasnt been tested yet
def safe_html(html):
    """Sanitize an HTML fragment for safe display.

    Blacklisted tags are removed with their contents, whitelisted tags are
    kept with only whitelisted attributes, table markup is unwrapped (text
    kept, tags dropped), and everything else is unwrapped.  HTML comments
    are stripped because scripts can be executed from comments in some
    cases.  Returns None for empty input or the sentinel string ", -".
    """
    if not html:
        return None
    # remove these tags, complete with contents.
    blacklist = ["script", "style"]
    whitelist = [
        "div", "span", "p", "br", "pre", "a",
        "blockquote",
        "ul", "li", "ol",
        "b", "em", "i", "strong", "u", "iframe", "img",
        "h1", "h2", "h3", "h4", "h5", "h6",
    ]
    # Table markup is unwrapped below: the tags go away but their text survives.
    removelist = ['table', 'tbody', 'thead', 'th', 'tr', 'td']
    # BeautifulSoup is catching out-of-order and unclosed tags, so markup
    # can't leak out of comments and break the rest of the page.
    soup = BeautifulSoup(html, 'lxml')
    # now strip HTML we don't like.
    for tag in soup.findAll():
        name = tag.name.lower()
        if name in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
        elif name in whitelist:
            # Tag is allowed; make sure all of its attributes are allowed.
            # BUGFIX: iframe and img used to skip this filter entirely,
            # which let event-handler attributes (onerror, onload, style,
            # ...) survive sanitation.  They are whitelisted tags, so their
            # attributes are now filtered like any other allowed tag's.
            for attr in list(tag.attrs):
                if not _attr_name_whitelisted(attr):
                    tag.attrs.pop(attr)
        elif name in removelist:
            tag.unwrap()
        else:
            # Not a whitelisted tag: keep its children, drop the tag itself.
            tag.unwrap()
    # scripts can be executed from comments in some cases
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()
    result = str(soup)
    # NOTE(review): ", -" looks like a known-garbage artifact from the
    # upstream export — treated the same as empty input.
    if result == ", -":
        return None
    return result
def _attr_name_whitelisted(attr_name):
return attr_name.lower() in ["href", "src","width","height","alt","target","title","class","id"]
def safe_css(attr, css):
    """Strip width/height declarations from inline "style" values.

    Any other attribute's value passes through untouched.
    """
    if attr != "style":
        return css
    return re.sub("(width|height):[^;]+;", "", css)
def plaintext(input):
    """Converts HTML to plaintext, preserving whitespace.

    The fragment is first sanitized with safe_html(), then every text node
    is concatenated and HTML character/entity references are decoded.
    """
    # from http://effbot.org/zone/re-sub.htm#unescape-html
    def _unescape(text):
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # numeric character reference: &#65; or &#x41;
                try:
                    if text[:3] == "&#x":
                        return chr(int(text[3:-1], 16))
                    else:
                        return chr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity: &amp; etc.
                try:
                    text = chr(html.entities.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text  # leave unrecognized references as-is
        # BUGFIX: raw string — "\w" in a plain literal is an invalid escape
        # sequence (SyntaxWarning/DeprecationWarning on modern Python).
        return re.sub(r"&#?\w+;", fixup, text)
    input = safe_html(input)  # basic sanitation first
    # BUGFIX: name the parser explicitly ('lxml', matching safe_html).
    # Omitting it emits a GuessedAtParserWarning and can select different
    # parsers — and thus different output — on different machines.
    text = "".join(BeautifulSoup("<body>%s</body>" % input, 'lxml').body(text=True))
    text = text.replace("xml version='1.0' encoding='%SOUP-ENCODING%'", "")  # strip BS meta-data
    return _unescape(text)
#os.system("node node_modules/pa11y/bin/pa11y.js --standard Section508 http://www.gavilan.edu/student/online")
def check_folder(fname, path):
    """Run pa11y (Section 508 standard) over every file in path+fname.

    Returns (error_count, html_report).  A missing folder or any other
    error ends the scan early and whatever was accumulated so far is
    returned, so one bad course folder doesn't kill the whole run.
    """
    report = '<h2>' + fname + '</h2>\n'
    count = 0
    try:
        for F in os.listdir(path + fname):
            target = path + fname + "/" + F
            # BUGFIX: pass an argv list with the default shell=False so file
            # names containing spaces or shell metacharacters can neither
            # break nor inject into the command line (the old version built
            # one unquoted string and ran it with shell=True).
            cmd = ["/usr/bin/node",
                   "/home/phowell/Documents/access/node_modules/pa11y/bin/pa11y.js",
                   "--standard", "Section508",
                   target]
            print(("" + target))
            output = subprocess.run(cmd, stdout=subprocess.PIPE,
                                    universal_newlines=True, check=False)
            report += "<h3>" + F + "</h3>\n"
            # pa11y's summary line is the third line from the end of stdout.
            line = output.stdout.split('\n')[-3]
            if re.search(r'No\sissues', line):
                pass  # clean file — nothing to report
            else:
                m = re.search(r'(\d+)\sErr', line)
                if m:
                    count += int(m.group(1))
                # Skip pa11y's banner (first 4 lines), escape the rest into
                # the HTML report.
                lines = output.stdout.split("\n")[4:]
                report += "<pre>" + html.escape("\n".join(lines)) + "</pre>\n\n\n"
    except Exception as e:
        print('finished with error or folder missing')
        print(e)
    return int(count), report
def check_class(folder):
    """Check one course's assignments and pages folders.

    Returns (total_error_count, html_report) for the course.
    """
    path = "/home/phowell/hdd/SCRIPTS/everything-json/course_temps/" + folder + "/"
    header = "<h1>Report on course: " + folder + "</h1>\n\n"
    assignments_count, assignments_report = check_folder('assignments', path)
    pages_count, pages_report = check_folder('pages', path)
    total = assignments_count + pages_count
    return total, header + assignments_report + pages_report
def check_all():
    """Scan every course folder under course_temps and write two files:
    report.html (full pa11y output per course) and summary.html (one line
    per course with its error count, linking into the full report).
    """
    hd_path = '/home/phowell/hdd/SCRIPTS/everything-json/'
    # BUGFIX: the two report handles were opened and never closed; the
    # context manager guarantees they are flushed and closed even when a
    # course scan raises.
    with open(hd_path + 'report.html', 'w') as rep_f, \
         open(hd_path + 'summary.html', 'w') as rep_s:
        rep_f.write('<meta charset="utf-8"/>\n')
        listt = os.listdir('/home/phowell/hdd/SCRIPTS/everything-json/course_temps')
        #listt = ['course_4341',] # for testing
        for L in listt:
            print(('Directory is: ' + L))
            # The first *.txt file in the course folder carries the
            # human-readable course name (fall back to "unknown").
            m = glob.glob('/home/phowell/hdd/SCRIPTS/everything-json/course_temps/' + L + '/*.txt')
            if m:
                name = m[0]
            else:
                name = 'unknown.txt'
            name = name.split('.')[0]
            name = name.split('/')[-1]
            print(('name is: ' + name))
            (cnt, rep) = check_class(L)
            rep_f.write("<a name='" + L + "'><h1>" + name + "</h1>\n" + rep + "\n\n<br /><br />\n\n")
            # Flush after each course so the reports stay readable while the
            # long scan is still running.
            rep_f.flush()
            rep_s.write("(" + str(cnt) + ") Class: <a href='report.html#" + L + "'>" + name + "</a><br />\n")
            rep_s.flush()
# Script entry point: scan every course folder and write the reports.
if __name__ == "__main__":
    check_all()
    #print(('arguments: '+str(sys.argv)))
# test
# NOTE(review): dead manual-test snippet, parked in a module-level string
# literal so it never runs; it re-sanitized one course's pages with
# safe_html() during development.
"""
file = 'course_temps/course_6862/pages/choose-the-right-browser.html'
dir = 'course_temps/course_6862/pages/'
#ff = open(file,'r').read()
#print safe_html(ff)
for file in os.listdir(dir):
if re.search('_cleaned\.html',file):
os.remove(dir+file)
for file in os.listdir(dir):
if file.endswith(".html"):
newfname = re.sub('\.html','_cleaned.html',file)
ff = codecs.open(dir+file,'r','utf-8').read()
print(file)
print(newfname)
newf = codecs.open(dir+newfname,'w','utf-8')
newf.write(safe_html(ff))
newf.close()
"""