# Common functions for checking web and canvas for accessibility
import os, sys, glob, codecs
import subprocess, re, pdb, html
from bs4 import BeautifulSoup, Comment
import html.entities
from datetime import datetime
#from html.parser import HTMLParseError


# the following from: https://chase-seibert.github.io/blog/2011/01/28/sanitize-html-with-beautiful-soup.html#
# hasnt been tested yet
def safe_html(html):
    """Sanitize an HTML fragment for safe re-display.

    Blacklisted tags (script/style) are removed with their contents,
    whitelisted tags keep only whitelisted attributes, table markup is
    unwrapped (contents kept, tags dropped), everything else is unwrapped,
    and HTML comments are stripped.

    NOTE: the parameter name ``html`` shadows the stdlib ``html`` module;
    it is kept for interface compatibility (the module is not needed here).

    Returns the sanitized markup as a string, or None for empty input.
    """
    if not html:
        return None

    # remove these tags, complete with contents.
    blacklist = ["script", "style"]

    whitelist = [
        "div", "span", "p", "br", "pre", "a",
        "blockquote", "ul", "li", "ol", "b", "em", "i", "strong", "u",
        "iframe", "img",
        "h1", "h2", "h3", "h4", "h5", "h6",
    ]

    try:
        # BeautifulSoup is catching out-of-order and unclosed tags, so markup
        # can't leak out of comments and break the rest of the page.
        soup = BeautifulSoup(html, 'lxml')
    except Exception as e:
        # special handling?
        raise e

    # table markup: keep the contents but drop the tags themselves
    removelist = ['table', 'tbody', 'thead', 'th', 'tr', 'td']

    # now strip HTML we don't like.
    for tag in soup.findAll():
        name = tag.name.lower()
        # iframe/img are passed through untouched, attributes included
        if name == 'iframe':
            continue
        if name == 'img':
            continue
        if name in blacklist:
            # blacklisted tags are removed in their entirety
            tag.extract()
        elif name in whitelist:
            # tag is allowed. Make sure all the attributes are allowed.
            # iterate over a snapshot since we mutate tag.attrs in the loop
            for k, v in list(tag.attrs.items()):
                if not _attr_name_whitelisted(k):
                    tag.attrs.pop(k)
                else:
                    tag.attrs[k] = v
        elif name in removelist:
            tag.unwrap()
        else:
            # not a whitelisted tag. I'd like to remove it from the tree
            # and replace it with its children. But that's hard. It's much
            # easier to just unwrap it (replace the tag with its contents).
            tag.unwrap()

    # scripts can be executed from comments in some cases
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        comment.extract()

    # local renamed from 'safe_html' -- the original shadowed this function
    result = str(soup)
    if result == ", -":
        return None
    return result


def _attr_name_whitelisted(attr_name):
    """Return True if attr_name (case-insensitive) is an allowed attribute."""
    return attr_name.lower() in ["href", "src", "width", "height", "alt",
                                 "target", "title", "class", "id"]


def safe_css(attr, css):
    """Strip width/height declarations from an inline 'style' value."""
    if attr == "style":
        return re.sub(r"(width|height):[^;]+;", "", css)
    return css


def plaintext(input):
    """Converts HTML to plaintext, preserving whitespace."""

    # from http://effbot.org/zone/re-sub.htm#unescape-html
    def _unescape(text):
        def fixup(m):
            text = m.group(0)
            if text[:2] == "&#":
                # character reference
                try:
                    if text[:3] == "&#x":
                        return chr(int(text[3:-1], 16))
                    else:
                        return chr(int(text[2:-1]))
                except ValueError:
                    pass
            else:
                # named entity
                try:
                    text = chr(html.entities.name2codepoint[text[1:-1]])
                except KeyError:
                    pass
            return text  # leave as is
        return re.sub(r"&#?\w+;", fixup, text)

    input = safe_html(input)  # basic sanitation first
    text = "".join(BeautifulSoup("%s" % input).body(text=True))
    # strip BS meta-data
    text = text.replace("xml version='1.0' encoding='%SOUP-ENCODING%'", "")
    return _unescape(text)


#os.system("node node_modules/pa11y/bin/pa11y.js --standard Section508 http://www.gavilan.edu/student/online")

def check_folder(fname, path):
    """Run pa11y (Section 508 standard) over every file in path+fname.

    Each file is checked with the pa11y CLI via node; per-file output is
    accumulated into an HTML report and the error counts from pa11y's
    summary line are summed.

    NOTE(review): the HTML tags inside the report string literals were lost
    when this file was extracted; the heading/<pre> tags below are a
    best-guess reconstruction -- confirm against the rendered report.

    Returns (error_count, html_report).
    """
    report = '<h2>' + fname + '</h2>\n'
    count = 0
    try:
        for F in os.listdir(path + fname):  # 'assignments'):
            # NOTE(review): shell=True with a concatenated path is fragile --
            # filenames containing spaces or shell metacharacters will break
            # (or inject into) the command; consider
            # subprocess.run([node, pa11y, '--standard', 'Section508', target]).
            cmd = "/usr/bin/node " + \
                  "/home/phowell/Documents/access/node_modules/pa11y/bin/pa11y.js --standard Section508 " + \
                  path + fname + "/" + F
            print(("" + path + fname + "/" + F))
            output = subprocess.run(cmd, stdout=subprocess.PIPE,
                                    universal_newlines=True, shell=True,
                                    check=False)
            report += "<h3>" + F + "</h3>\n"
            # third-from-last line of pa11y's output carries the summary
            line = output.stdout.split('\n')[-3]
            if re.search(r'No\sissues', line):
                pass  # print("Got zero")
            else:
                m = re.search(r'(\d+)\sErr', line)
                if m:
                    count += int(m.group(1))
            lines = output.stdout.split("\n")
            lines = lines[4:]  # drop pa11y's banner/header lines
            report += "<pre>" + html.escape("\n".join(lines)) + "</pre>\n\n\n"
    except Exception as e:
        # best-effort: a missing folder just ends this section of the report
        print('finished with error or folder missing')
        print(e)
    return int(count), report

def check_class(folder):
    """Run accessibility checks on one course folder.

    Checks the 'assignments' and 'pages' subfolders via check_folder and
    concatenates their reports.

    NOTE(review): the heading tags in the literal below were lost in
    extraction and are reconstructed best-guess.

    Returns (total_error_count, html_report).
    """
    path = "/home/phowell/hdd/SCRIPTS/everything-json/course_temps/" + folder + "/"
    class_report = "<h1>Report on course: " + folder + "</h1>\n\n"
    (cnt_a, rep_a) = check_folder('assignments', path)
    (cnt_p, rep_p) = check_folder('pages', path)
    class_report += rep_a
    class_report += rep_p
    #oo = open(path+'report.html','w')
    #oo.write(class_report)
    #oo.close()
    #print(class_report)
    return cnt_a + cnt_p, class_report

def check_all():
    """Check every course under course_temps/, writing report.html and summary.html.

    For each course directory the human-readable name is taken from the
    first *.txt file found inside it (basename without extension), falling
    back to 'unknown'. Both output files are flushed after each course so a
    long run can be monitored.

    NOTE(review): the HTML tags in the write() literals were lost in
    extraction; the tags below are reconstructed best-guess.
    """
    hd_path = '/home/phowell/hdd/SCRIPTS/everything-json/'
    rep_f = open(hd_path + 'report.html', 'w')
    rep_s = open(hd_path + 'summary.html', 'w')
    rep_f.write('\n')
    listt = os.listdir('/home/phowell/hdd/SCRIPTS/everything-json/course_temps')
    #listt = ['course_4341',]  # for testing
    for L in listt:
        print(('Directory is: ' + L))
        m = glob.glob('/home/phowell/hdd/SCRIPTS/everything-json/course_temps/'
                      + L + '/*.txt')
        if m:
            name = m[0]
        else:
            name = 'unknown.txt'
        name = name.split('.')[0]
        name = name.split('/')[-1]
        print(('name is: ' + name))
        (cnt, rep) = check_class(L)
        rep_f.write("<h2>" + name + "</h2>\n" + rep + "\n\n<hr>\n\n")
        rep_f.flush()
        rep_s.write("(" + str(cnt) + ") Class: <b>" + name + "</b><br>\n")
        rep_s.flush()


if __name__ == "__main__":
    check_all()
    #print(('arguments: '+str(sys.argv)))

# test
"""
file = 'course_temps/course_6862/pages/choose-the-right-browser.html'
dir = 'course_temps/course_6862/pages/'

#ff = open(file,'r').read()
#print safe_html(ff)

for file in os.listdir(dir):
    if re.search('_cleaned\.html',file):
        os.remove(dir+file)

for file in os.listdir(dir):
    if file.endswith(".html"):
        newfname = re.sub('\.html','_cleaned.html',file)
        ff = codecs.open(dir+file,'r','utf-8').read()
        print(file)
        print(newfname)
        newf = codecs.open(dir+newfname,'w','utf-8')
        newf.write(safe_html(ff))
        newf.close()
"""