# Common functions for checking web and canvas for accessibility
|
|
|
|
import os, sys, glob, codecs
|
|
import subprocess, re, pdb, html
|
|
from bs4 import BeautifulSoup, Comment
|
|
import html.entities
|
|
from datetime import datetime
|
|
import pdb
|
|
#from html.parser import HTMLParseError
|
|
|
|
|
|
# the following from: https://chase-seibert.github.io/blog/2011/01/28/sanitize-html-with-beautiful-soup.html#
|
|
# hasnt been tested yet
|
|
|
|
|
|
def safe_html(html):
    """Sanitize an HTML fragment for the accessibility reports.

    Blacklisted tags (script/style) are removed together with their
    contents; whitelisted tags are kept but stripped of any attribute not
    accepted by _attr_name_whitelisted(); table-structure tags are
    unwrapped (tag removed, children kept); every other tag is likewise
    unwrapped.  HTML comments are dropped because scripts can be executed
    from comments in some cases.

    Parameters:
        html -- HTML source string (may be None or empty).

    Returns:
        The sanitized HTML as a string, or None for empty input or when
        the result is the ", -" artifact some garbage inputs produce.
    """
    if not html:
        return None

    # remove these tags, complete with contents.
    blacklist = ["script", "style"]

    # tags that may remain (their attributes are still filtered).
    whitelist = [
        "div", "span", "p", "br", "pre", "a",
        "blockquote",
        "ul", "li", "ol",
        "b", "em", "i", "strong", "u", "iframe", "img",
        "h1", "h2", "h3", "h4", "h5", "h6",
    ]

    # BeautifulSoup is catching out-of-order and unclosed tags, so markup
    # can't leak out of comments and break the rest of the page.
    soup = BeautifulSoup(html, 'lxml')

    # table structure is flattened: the tags go away, the content stays.
    removelist = ['table', 'tbody', 'thead', 'th', 'tr', 'td']

    # now strip HTML we don't like.
    for tag in soup.find_all():
        name = tag.name.lower()
        if name in ('iframe', 'img'):
            # kept untouched, attributes included (src etc. are needed).
            continue
        if name in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
        elif name in whitelist:
            # tag is allowed. Make sure all the attributes are allowed.
            for attr in list(tag.attrs):
                if not _attr_name_whitelisted(attr):
                    tag.attrs.pop(attr)
        elif name in removelist:
            tag.unwrap()
        else:
            # not a whitelisted tag: replace it with its children.
            tag.unwrap()

    # scripts can be executed from comments in some cases
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    result = str(soup)

    # artifact produced by some empty/garbage inputs -- treat as empty.
    if result == ", -":
        return None

    return result
|
|
|
|
def _attr_name_whitelisted(attr_name):
|
|
return attr_name.lower() in ["href", "src","width","height","alt","target","title","class","id"]
|
|
|
|
def safe_css(attr, css):
    """Strip width/height declarations from an inline ``style`` value.

    Parameters:
        attr -- attribute name; filtering only applies when it is "style".
        css  -- the attribute value.

    Returns:
        *css* with every ``width:``/``height:`` declaration removed when
        *attr* is "style"; otherwise *css* unchanged.
    """
    if attr == "style":
        # ';?' also catches a final declaration with no trailing semicolon.
        return re.sub(r"(width|height):[^;]+;?", "", css)
    return css
|
|
|
|
def plaintext(input):
    """Converts HTML to plaintext, preserving whitespace.

    The input is sanitized with safe_html() first, then all text nodes are
    concatenated and HTML character/entity references are decoded.
    """

    # from http://effbot.org/zone/re-sub.htm#unescape-html
    def _unescape(text):
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # numeric character reference (decimal or hex, either case)
                try:
                    if text[2] in "xX":
                        return chr(int(text[3:-1], 16))
                    return chr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    text = chr(html.entities.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text  # leave as is
        # raw string: "\w" in a plain literal is an invalid escape sequence
        return re.sub(r"&#?\w+;", fixup, text)

    input = safe_html(input)  # basic sanitation first
    # explicit parser avoids bs4's GuessedAtParserWarning and keeps the
    # result consistent with safe_html(), which also uses lxml.
    text = "".join(BeautifulSoup("<body>%s</body>" % input, 'lxml').body(text=True))
    text = text.replace("xml version='1.0' encoding='%SOUP-ENCODING%'", "") # strip BS meta-data
    return _unescape(text)
|
|
|
|
|
|
#os.system("node node_modules/pa11y/bin/pa11y.js --standard Section508 http://www.gavilan.edu/student/online")
|
|
|
|
|
|
def check_folder(fname, path):
    """Run pa11y (Section 508 standard) over every file in ``path + fname``.

    Parameters:
        fname -- sub-folder name, e.g. 'assignments' or 'pages'.
        path  -- course folder path ending in '/'.

    Returns:
        (error_count, report) -- total pa11y error count over all files,
        and an HTML fragment describing each file's results.
    """
    report = '<h2>' + fname + '</h2>\n'
    count = 0
    pa11y = "/home/phowell/Documents/access/node_modules/pa11y/bin/pa11y.js"
    try:
        for F in os.listdir(path + fname):
            target = path + fname + "/" + F
            print(target)
            # argument list instead of a shell string: file names with
            # spaces or shell metacharacters can't break the command.
            output = subprocess.run(
                ["/usr/bin/node", pa11y, "--standard", "Section508", target],
                stdout=subprocess.PIPE, universal_newlines=True, check=False)

            report += "<h3>" + F + "</h3>\n"
            lines = output.stdout.split('\n')
            # pa11y prints its summary on the third-to-last line; guard
            # against unexpectedly short output instead of IndexError-ing.
            summary = lines[-3] if len(lines) >= 3 else ''
            if not re.search(r'No\sissues', summary):
                m = re.search(r'(\d+)\sErr', summary)
                if m:
                    count += int(m.group(1))
                # skip pa11y's banner lines, escape the rest for the report
                report += "<pre>" + html.escape("\n".join(lines[4:])) + "</pre>\n\n\n"
    except Exception as e:
        # best-effort: a missing folder just yields an empty section
        print('finished with error or folder missing')
        print(e)
    return int(count), report
|
|
|
|
def check_class(folder):
    """Build the accessibility report for one downloaded course folder.

    Parameters:
        folder -- course folder name under course_temps/.

    Returns:
        (total_error_count, html_report) covering both the course's
        'assignments' and 'pages' sub-folders.
    """
    base = "/home/phowell/hdd/SCRIPTS/everything-json/course_temps/" + folder + "/"
    report = "<h1>Report on course: " + folder + "</h1>\n\n"

    assignment_count, assignment_report = check_folder('assignments', base)
    page_count, page_report = check_folder('pages', base)

    report += assignment_report + page_report
    return assignment_count + page_count, report
|
|
|
|
def check_all():
    """Run the accessibility audit over every downloaded course folder.

    Writes two files under hd_path: report.html (full per-course reports)
    and summary.html (one line per course with its total error count).
    """
    hd_path = '/home/phowell/hdd/SCRIPTS/everything-json/'

    # 'with' guarantees both report files are closed even if a course's
    # audit raises (the originals were never closed).
    with open(hd_path + 'report.html', 'w') as rep_f, \
         open(hd_path + 'summary.html', 'w') as rep_s:

        rep_f.write('<meta charset="utf-8"/>\n')

        listt = os.listdir(hd_path + 'course_temps')
        #listt = ['course_4341',] # for testing
        for L in listt:
            print('Directory is: ' + L)
            # the course's display name comes from the first .txt file in
            # its folder; fall back to 'unknown' when there is none.
            m = glob.glob(hd_path + 'course_temps/' + L + '/*.txt')
            name = m[0] if m else 'unknown.txt'
            name = name.split('.')[0]
            name = name.split('/')[-1]

            print('name is: ' + name)
            (cnt, rep) = check_class(L)
            rep_f.write("<a name='" + L + "'><h1>" + name + "</h1>\n" + rep + "\n\n<br /><br />\n\n")
            rep_f.flush()
            rep_s.write("(" + str(cnt) + ") Class: <a href='report.html#" + L + "'>" + name + "</a><br />\n")
            rep_s.flush()
|
|
|
|
# Script entry point: audit every course folder and write the reports.
if __name__ == "__main__":
    check_all()
|
|
|
|
#print(('arguments: '+str(sys.argv)))
|
|
|
|
# test
|
|
"""
|
|
file = 'course_temps/course_6862/pages/choose-the-right-browser.html'
|
|
dir = 'course_temps/course_6862/pages/'
|
|
#ff = open(file,'r').read()
|
|
#print safe_html(ff)
|
|
|
|
for file in os.listdir(dir):
|
|
if re.search('_cleaned\.html',file):
|
|
os.remove(dir+file)
|
|
|
|
for file in os.listdir(dir):
|
|
if file.endswith(".html"):
|
|
newfname = re.sub('\.html','_cleaned.html',file)
|
|
ff = codecs.open(dir+file,'r','utf-8').read()
|
|
print(file)
|
|
print(newfname)
|
|
newf = codecs.open(dir+newfname,'w','utf-8')
|
|
newf.write(safe_html(ff))
|
|
newf.close()
|
|
"""
|
|
|
|
|