import requests, codecs, os, re, json
|
|
from pipelines import header, fetch, url, put_file
|
|
from util import clean_title, to_file_friendly, minimal_string
|
|
from bs4 import BeautifulSoup as bs
|
|
from html import unescape
|
|
import tomd, checker
|
|
import html2markdown as h2m
|
|
import pypandoc
|
|
import webbrowser
|
|
# Cache of YouTube titles fetched by yt_title(); persisted between runs.
try:
    saved_titles = json.loads(codecs.open('cache/saved_youtube_titles.json', 'r', 'utf-8').read())
except (IOError, ValueError):
    saved_titles = {}

DBG = 1
|
|
|
|
def d(s):
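    """Debug helper: print s only when the module-level DBG flag is truthy."""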
|
|
global DBG
|
|
if DBG: print(s)
|
|
|
|
def stripper(s):
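    """Strip presentational attributes (style, color, width, ...) from an HTML
    string with BeautifulSoup so it converts to Markdown more cleanly."""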
|
|
REMOVE_ATTRIBUTES = [
|
|
'lang','language','onmouseover','onmouseout','script','style','font',
|
|
        'dir','face','size','color','class','width','height','hspace',
|
|
'border','valign','align','background','bgcolor','text','link','vlink',
|
|
'alink','cellpadding','cellspacing']
|
|
|
|
#doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is <i>paragraph</i> <a onmouseout="">one</a>.<p id="secondpara" align="blah">This is <i>paragraph</i> <b>two</b>.</html>'''
|
|
soup = bs(s, features='lxml')
|
|
for tag in soup.recursiveChildGenerator():
|
|
try:
|
|
            tag.attrs = {key: value for key, value in tag.attrs.items()
                         if key not in REMOVE_ATTRIBUTES}
|
|
except AttributeError:
|
|
# 'NavigableString' object has no attribute 'attrs'
|
|
pass
|
|
return soup.prettify()
|
|
|
|
def mycleaner(s):
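    """Tidy converted markup: turn <br/> into newlines, drop <b> tags, and
    collapse repeated spaces and whitespace-only lines."""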
|
|
s = re.sub(r'<br\s?\/>','\n',s)
|
|
s = re.sub(r'<\/?b>','',s)
|
|
s = re.sub(r' +',' ',s)
|
|
s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
|
|
s = re.sub('^ ','',s)
|
|
return s
|
|
|
|
def freshdesk():
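    """Read an exported Freshdesk Solutions.xml, convert each solution-article
    (title plus the desc-un-html field) to Markdown-ish text, and write the
    combined result to cache/faqs.txt."""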
|
|
path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
|
|
soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
|
|
|
|
outpt = codecs.open('cache/faqs.txt','w')
|
|
out = ""
|
|
for a in soup.find_all('solution-article'):
|
|
|
|
print("TITLE\n"+a.find('title').get_text())
|
|
out += a.find('title').get_text()
|
|
|
|
"""for d in a.find_all('description'):
|
|
#print(d)
|
|
if d:
|
|
d = h.unescape(d.get_text())
|
|
e = stripper(d)
|
|
m = tomd.convert( e )
|
|
m = mycleaner(m)
|
|
print("\nDESCRIPTION\n"+m)"""
|
|
|
|
#print("\nWHAT IS THIS?\n" +
|
|
hh = a.find('desc-un-html').get_text()
|
|
        d = unescape(hh)
|
|
e = stripper(d)
|
|
m = tomd.convert( e )
|
|
m = mycleaner(m)
|
|
print("\nDESCRIPTION\n"+m)
|
|
out += "\n\n" + m + "\n\n"
|
|
|
|
print("-----------\n\n")
|
|
outpt.write(out)
|
|
|
|
# Download everything interesting in a course to a local folder
|
|
# Build a master file with the entire class content
|
|
def accessible_check(id=""):
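    """Download a course's files, pages and assignments into
    ../course_temps/course_<id>, write an index.html of everything gathered,
    and stitch the content together in module order into fullcourse.raw.html,
    which pandoc then converts to .html, .md and .docx."""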
|
|
if not id:
|
|
id = input("ID of course to check? ")
|
|
pagebreak = '\n\n<!-- BREAK -->\n\n'
|
|
verbose = 1
|
|
|
|
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
|
|
|
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
|
|
|
item_id_to_index = {}
|
|
items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak,]
|
|
running_index = 1
|
|
|
|
modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)
|
|
|
|
    # Pre-size the ordered-content list; module items fill in their slots below
    items = [0] * 9000
|
|
|
|
video_link_list = []
|
|
|
|
for m in modules:
|
|
items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
|
|
running_index += 1
|
|
|
|
mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
|
|
|
|
for I in mod_items:
|
|
|
|
if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
|
|
running_index += 1
|
|
|
|
if I['type'] == 'SubHeader':
|
|
#print('subheader: ' + str(I))
|
|
                    items[running_index] = '<h3>%s</h3>\n' % I['title']
|
|
|
|
if I['type'] == 'Page':
|
|
item_id_to_index[ I['page_url'] ] = running_index
|
|
|
|
if I['type'] == 'Quiz':
|
|
item_id_to_index[ I['content_id'] ] = running_index
|
|
|
|
if I['type'] == 'Discussion':
|
|
item_id_to_index[ I['content_id'] ] = running_index
|
|
|
|
if I['type'] == 'ExternalUrl':
|
|
items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
|
|
|
|
# ?
|
|
#if 'content_id' in I:
|
|
# item_id_to_index[ I['content_id'] ] = running_index
|
|
else:
|
|
print("What is this item? " + str(I))
|
|
|
|
|
|
#items_inorder.append('<i>Not included: '+ I['title'] + '(a ' + I['type'] + ')</i>\n\n\n' )
|
|
|
|
# I['title']
|
|
# I['content_id']
|
|
# I['page_url']
|
|
# I['type']
|
|
# I['published']
|
|
# assignments and files have content_id, pages have page_url
|
|
|
|
course_folder = '../course_temps/course_'+id
|
|
index = []
|
|
try:
|
|
os.mkdir(course_folder)
|
|
except:
|
|
print("Course folder exists.")
|
|
###
|
|
### FILES
|
|
###
|
|
files_f = course_folder + '/files'
|
|
headered = 0
|
|
print("\nFILES")
|
|
try:
|
|
os.mkdir(files_f)
|
|
except:
|
|
print(" * Files folder already exists.")
|
|
|
|
files = fetch('/api/v1/courses/' + str(id) + '/files', verbose)
|
|
print("LISTING COURSE FILES")
|
|
for f in files:
|
|
        # record the size in whole kilobytes
        f['size'] = str(int(f['size']) // 1000) + 'k'
|
|
|
|
if f['content-type'] in save_file_types:
|
|
d(' - %s' % f['filename'])
|
|
|
|
if not os.path.exists(files_f + '/' + f['filename']):
|
|
r = requests.get(f['url'],headers=header, stream=True)
|
|
with open(files_f + '/' + f['filename'], 'wb') as fd:
|
|
for chunk in r.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
else:
|
|
d(" - already downloaded %s" % files_f + '/' + f['filename'])
|
|
|
|
if not headered:
|
|
index.append( ('<br /><b>Files</b><br />') )
|
|
headered = 1
|
|
index.append( ('files/' + f['filename'], f['filename']) )
|
|
|
|
###
|
|
### PAGES
|
|
###
|
|
pages_f = course_folder + '/pages'
|
|
headered = 0
|
|
image_count = 0
|
|
print("\nPAGES")
|
|
try:
|
|
os.mkdir(pages_f)
|
|
except:
|
|
print(" * Pages folder already exists.")
|
|
|
|
|
|
pages = fetch('/api/v1/courses/' + str(id) + '/pages', verbose)
|
|
for p in pages:
|
|
d(' - %s' % p['title'])
|
|
|
|
p['title'] = clean_title(p['title'])
|
|
easier_filename = clean_title(p['url'])
|
|
this_page_filename = "%s/%s.html" % (pages_f, easier_filename)
|
|
#for a in 'title,updated_at,published'.split(','):
|
|
# print(str(p[a]), "\t", end=' ')
|
|
|
|
if not headered:
|
|
index.append( ('<br /><b>Pages</b><br />') )
|
|
headered = 1
|
|
index.append( ( 'pages/' + easier_filename + '.html', p['title'] ) )
|
|
|
|
|
|
if os.path.exists(this_page_filename):
|
|
d(" - already downloaded %s" % this_page_filename)
|
|
this_page_content = open(this_page_filename,'r').read()
|
|
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
|
|
d(' * skipping file behind passwords')
|
|
else:
|
|
t2 = fetch('/api/v1/courses/' + str(id) + '/pages/'+p['url'], verbose)
|
|
if t2 and 'body' in t2 and t2['body']:
|
|
bb = bs(t2['body'],features="lxml")
|
|
a_links = bb.find_all('a')
|
|
for A in a_links:
|
|
if re.search( r'youtu', A['href']):
|
|
video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") )
|
|
|
|
|
|
page_images = bb.find_all('img')
|
|
for I in page_images:
|
|
d(' - %s' % I['src'])
|
|
if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']):
|
|
d(' * skipping file behind passwords')
|
|
else:
|
|
try:
|
|
r = requests.get(I['src'],headers=header, stream=True)
|
|
mytype = r.headers['content-type']
|
|
#print("Response is type: " + str(mytype))
|
|
r_parts = mytype.split("/")
|
|
ending = r_parts[-1]
|
|
|
|
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
|
|
for chunk in r.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
image_count += 1
|
|
except Exception as e:
|
|
d( ' * Error downloading page image, %s' % str(e) )
|
|
|
|
try:
|
|
with codecs.open(this_page_filename, 'w','utf-8') as fd:
|
|
this_page_content = "<h2>%s</h2>\n%s" % ( t2['title'], t2['body'] )
|
|
fd.write(this_page_content)
|
|
except:
|
|
d(' * problem writing page content')
|
|
## TODO include linked pages even if they aren't in module
|
|
else:
|
|
d(' * nothing returned or bad fetch')
|
|
# write to running log of content in order of module
|
|
if p and p['url'] in item_id_to_index:
|
|
items[ item_id_to_index[ p['url'] ] ] = this_page_content +'\n\n'+pagebreak
|
|
else:
|
|
d(' -- This page didnt seem to be in the modules list.')
|
|
|
|
|
|
###
|
|
### ASSIGNMENTS
|
|
###
|
|
headered = 0
|
|
asm_f = course_folder + '/assignments'
|
|
print("\nASSIGNMENTS")
|
|
try:
|
|
os.mkdir(asm_f)
|
|
except:
|
|
d(" - Assignments dir exists")
|
|
|
|
asm = fetch('/api/v1/courses/' + str(id) + '/assignments', verbose)
|
|
for p in asm:
|
|
d(' - %s' % p['name'])
|
|
|
|
|
|
try:
|
|
friendlyfile = to_file_friendly(p['name'])
|
|
this_assmt_filename = asm_f + '/' + str(p['id'])+"_"+ friendlyfile + '.html'
|
|
if os.path.exists(this_assmt_filename):
|
|
d(" - already downloaded %s" % this_assmt_filename)
|
|
this_assmt_content = open(this_assmt_filename,'r').read()
|
|
else:
|
|
t2 = fetch('/api/v1/courses/' + str(id) + '/assignments/'+str(p['id']), verbose)
|
|
with codecs.open(this_assmt_filename, 'w','utf-8') as fd:
|
|
this_assmt_content = "<h2>%s</h2>\n%s\n\n" % (t2['name'], t2['description'])
|
|
fd.write(this_assmt_content)
|
|
if not headered:
|
|
index.append( ('<br /><b>Assignments</b><br />') )
|
|
headered = 1
|
|
index.append( ('assignments/' + str(p['id'])+"_"+friendlyfile + '.html', p['name']) )
|
|
|
|
# write to running log of content in order of module
|
|
            if p['id'] in item_id_to_index:
                items[ item_id_to_index[ p['id'] ] ] = this_assmt_content + '\n\n' + pagebreak
|
|
except Exception as e:
|
|
d(' * Problem %s' % str(e))
|
|
|
|
###
|
|
### FORUMS
|
|
###
|
|
"""forum_f = course_folder + '/forums'
|
|
headered = 0
|
|
image_count = 0
|
|
print("\nFORUMS")
|
|
try:
|
|
os.mkdir(forum_f)
|
|
forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
|
|
for p in forums:
|
|
p['title'] = clean_title(p['title'])
|
|
forum_id = p['id']
|
|
easier_filename = p['title']
|
|
for a in 'title,posted_at,published'.split(','):
|
|
print(str(p[a]), "\t", end=' ')
|
|
print("")
|
|
t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
|
|
|
|
|
|
#### REMOVED
|
|
bb = bs(t2['body'],features="lxml")
|
|
print("IMAGES IN THIS PAGE")
|
|
page_images = bb.find_all('img')
|
|
for I in page_images:
|
|
r = requests.get(I['src'],headers=header, stream=True)
|
|
mytype = r.headers['content-type']
|
|
print("Response is type: " + str(mytype))
|
|
r_parts = mytype.split("/")
|
|
ending = r_parts[-1]
|
|
|
|
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
|
|
for chunk in r.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
image_count += 1
|
|
#### END REMOVED
|
|
|
|
try:
|
|
with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
|
|
fd.write("<h1>"+t2['title']+"</h1>\n")
|
|
fd.write(t2['message'])
|
|
if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
|
|
headered = 1
|
|
index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
|
|
|
|
# write to running log of content in order of module
|
|
if p['id'] in item_id_to_index:
|
|
items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
|
|
else:
|
|
print(' This forum didnt seem to be in the modules list.')
|
|
except Exception as e:
|
|
print("Error here:", e)
|
|
#print p
|
|
#print results_dict
|
|
except Exception as e:
|
|
print("** Forum folder seems to exist. Skipping those.")
|
|
print(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
###
|
|
### QUIZZES
|
|
###
|
|
|
|
|
|
# get a list external urls
|
|
headered = 0
|
|
t = url + '/api/v1/courses/' + str(id) + '/modules'
|
|
while t: t = fetch(t)
|
|
mods = results
|
|
results = []
|
|
for m in mods:
|
|
results = []
|
|
t2 = url + '/api/v1/courses/' + str(id) + '/modules/' + str(m['id']) + '/items'
|
|
while t2: t2 = fetch(t2)
|
|
items = results
|
|
for i in items:
|
|
#print i
|
|
if i['type'] == "ExternalUrl":
|
|
#print i
|
|
for j in 'id,title,external_url'.split(','):
|
|
print unicode(i[j]), "\t",
|
|
print ""
|
|
if not headered: index.append( ('<br /><b>External Links</b><br />') )
|
|
headered = 1
|
|
index.append( (i['external_url'], i['title']) )
|
|
"""
|
|
|
|
|
|
|
|
# Create index page of all gathered items
|
|
myindex = codecs.open(course_folder+'/index.html','w','utf-8')
|
|
for i in index:
|
|
if len(i)==2: myindex.write("<a href='"+i[0]+"'>"+i[1]+"</a><br />\n")
|
|
else: myindex.write(i)
|
|
|
|
|
|
|
|
# Full course content in single file
|
|
print("Writing main course files...")
|
|
mycourse = codecs.open(course_folder+'/fullcourse.raw.html','w','utf-8')
|
|
|
|
for I in items:
|
|
if I:
|
|
mycourse.write( I )
|
|
|
|
|
|
|
|
temp = open('cache/coursedump.txt','w')
|
|
temp.write( "items: " + json.dumps(items,indent=2) )
|
|
temp.write("\n\n\n")
|
|
temp.write( "index: " + json.dumps(index,indent=2) )
|
|
temp.write("\n\n\n")
|
|
temp.write( "items_inorder: " + json.dumps(items_inorder,indent=2) )
|
|
temp.write("\n\n\n")
|
|
temp.write( "item_id_to_index: " + json.dumps(item_id_to_index,indent=2) )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if video_link_list:
|
|
mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
|
|
for V in video_link_list:
|
|
            (v_url, txt, pg) = V
            mycourse.write("<tr><td><a target='_blank' href='"+v_url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
|
|
mycourse.write("</table>\n")
|
|
|
|
mycourse.close()
|
|
output = pypandoc.convert_file(course_folder+'/fullcourse.raw.html', 'html', outputfile=course_folder+"/fullcourse.html")
|
|
output1 = pypandoc.convert_file(course_folder+'/fullcourse.html', 'md', outputfile=course_folder+"/fullcourse.md")
|
|
output2 = pypandoc.convert_file(course_folder+'/fullcourse.html', 'docx', outputfile=course_folder+"/fullcourse.docx")
|
|
|
|
|
|
def pan_testing():
|
|
course_folder = '../course_temps/course_6862'
|
|
output3 = pypandoc.convert_file(course_folder+'/fullcourse.md', 'html', outputfile=course_folder+"/fullcourse.v2.html")
|
|
|
|
# Given course, page url, and new content, upload the new revision of a page
|
|
def create_page(course_num,new_title,new_content):
|
|
t3 = url + '/api/v1/courses/' + str(course_num) + '/pages'
|
|
#xyz = raw_input('Enter 1 to continue and send back to: ' + t3 + ': ')
|
|
#print("Creating page: %s\nwith content:%s\n\n\n" % (new_title,new_content))
|
|
print("Creating page: %s" % new_title)
|
|
xyz = input('type 1 to confirm: ') #'1'
|
|
if xyz=='1':
|
|
data = {'wiki_page[title]':new_title, 'wiki_page[body]':new_content}
|
|
r3 = requests.post(t3, headers=header, params=data)
|
|
print(r3)
|
|
print('ok')
|
|
|
|
|
|
def md_to_course():
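    """Convert a Markdown dump of a course to HTML with pandoc, split the
    result on <h1> headings, and push each section back to Canvas with
    upload_page(), using the heading text as the page title/url."""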
|
|
#input = 'C:/Users/peter/Nextcloud/Documents/gavilan/student_orientation.txt'
|
|
#output = 'C:/Users/peter/Nextcloud/Documents/gavilan/stu_orientation/student_orientation.html'
|
|
id = "11214"
|
|
infile = 'cache/pages/course_%s.md' % id
|
|
output = 'cache/pages/course_%s_fixed.html' % id
|
|
output3 = pypandoc.convert_file(infile, 'html', format='md', outputfile=output)
|
|
|
|
xx = codecs.open(output,'r','utf-8').read()
|
|
soup = bs( xx, features="lxml" )
|
|
soup.encode("utf-8")
|
|
|
|
current_page = ""
|
|
current_title = ""
|
|
|
|
for child in soup.body.children:
|
|
if child.name == "h1" and not current_title:
|
|
current_title = child.get_text()
|
|
elif child.name == "h1":
|
|
upload_page(id,current_title,current_page)
|
|
current_title = child.get_text()
|
|
current_page = ""
|
|
print( "Next page: %s" % current_title )
|
|
else:
|
|
#print(dir(child))
|
|
if 'prettify' in dir(child):
|
|
current_page += child.prettify(formatter="html")
|
|
else:
|
|
current_page += child.string
|
|
|
|
upload_page(id,current_title,current_page)
|
|
print("Done")
|
|
|
|
|
|
# DL pages only
|
|
def grab_course_pages(course_num=-1):
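    """Walk a course's modules and download every Page item in order, writing
    a combined HTML file and a Markdown file under cache/pages/."""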
|
|
global results, results_dict, url, header
|
|
# course_num = raw_input("What is the course id? ")
|
|
    # callers pass either an int (default -1 means "ask") or a string id
    if course_num == -1:
        course_num = input("Id of course? ")
    else:
        course_num = str(course_num)
|
|
modpagelist = []
|
|
modurllist = []
|
|
# We want things in the order of the modules
|
|
t4 = url + '/api/v1/courses/'+str(course_num)+'/modules?include[]=items'
|
|
results = fetch(t4)
|
|
i = 1
|
|
pageout = codecs.open('cache/pages/course_'+str(course_num)+'.html','w','utf-8')
|
|
pageoutm = codecs.open('cache/pages/course_'+str(course_num)+'.md','w','utf-8')
|
|
divider = "\n### "
|
|
for M in results:
|
|
print("Module Name: " + M['name'])
|
|
for I in M['items']:
|
|
if I['type']=='Page':
|
|
modpagelist.append(I['title'])
|
|
modurllist.append(I['page_url'])
|
|
pageout.write(divider+I['title']+'### '+I['page_url']+'\n')
|
|
easier_filename = clean_title(I['page_url'])
|
|
print(" " + str(i) + ". " + I['title'])
|
|
t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/'+I['page_url']
|
|
print('Getting: ' + t2)
|
|
mypage = fetch(t2)
|
|
fixed = checker.safe_html(mypage['body'])
|
|
if fixed:
|
|
#markdown = h2m.convert(fixed)
|
|
#p_data = pandoc.read(mypage['body'])
|
|
markdown = pypandoc.convert_text("\n<h1>" + I['title'] + "</h1>\n" + mypage['body'], 'md', format='html')
|
|
pageout.write(fixed+'\n')
|
|
pageoutm.write(markdown+'\n')
|
|
pageout.flush()
|
|
i += 1
|
|
pageout.close()
|
|
pageoutm.close()
|
|
|
|
# Upload pages. Local copy has a particular format.
|
|
# Appears to not be used
|
|
def put_course_pages():
|
|
course_num = '6862'
|
|
filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
|
|
my_titles = []
|
|
my_urls = []
|
|
my_bodys = []
|
|
started = 0
|
|
current_body = ""
|
|
for L in filein.readlines():
|
|
ma = re.search('^###\s(.*)###\s(.*)$',L)
|
|
if ma:
|
|
my_titles.append(ma.group(1))
|
|
my_urls.append(ma.group(2))
|
|
if started:
|
|
my_bodys.append(current_body)
|
|
current_body = ""
|
|
started = 1
|
|
else:
|
|
current_body += "\n" + L
|
|
my_bodys.append(current_body)
|
|
|
|
i = 0
|
|
for U in my_urls:
|
|
# and now upload it....lol
|
|
upload_page(course_num,U,my_bodys[i])
|
|
i += 1
|
|
|
|
# Also not used
|
|
def put_revised_pages():
|
|
course_num = '6862'
|
|
course_folder = '../course_temps/course_6862'
|
|
filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
|
|
my_titles = []
|
|
my_urls = []
|
|
my_bodys = []
|
|
started = 0
|
|
current_body = ""
|
|
for L in filein.readlines():
|
|
ma = re.search('^<h1>(.*)</h1>.*$',L)
|
|
if ma:
|
|
my_titles.append(ma.group(1))
|
|
my_urls.append(ma.group(2))
|
|
if started:
|
|
my_bodys.append(current_body)
|
|
current_body = ""
|
|
started = 1
|
|
else:
|
|
current_body += "\n" + L
|
|
my_bodys.append(current_body)
|
|
|
|
i = 0
|
|
for U in my_urls:
|
|
# and now upload it....lol
|
|
upload_page(course_num,U,my_bodys[i])
|
|
i += 1
|
|
|
|
# Download, clean html, and reupload page
|
|
def update_page():
|
|
global results, results_dict, url, header
|
|
# course_num = raw_input("What is the course id? ")
|
|
course_num = '6862'
|
|
t = url + '/api/v1/courses/' + str(course_num) + '/pages'
|
|
while t: t = fetch(t)
|
|
pages = results
|
|
results = []
|
|
mypagelist = []
|
|
myurllist = []
|
|
modpagelist = []
|
|
modurllist = []
|
|
for p in pages:
|
|
p['title'] = clean_title(p['title'])
|
|
mypagelist.append(p['title'])
|
|
myurllist.append(p['url'])
|
|
easier_filename = clean_title(p['url'])
|
|
#for a in 'title,updated_at,published'.split(','):
|
|
# print unicode(p[a]), "\t",
|
|
#print ""
|
|
|
|
# We want things in the order of the modules
|
|
t4 = url + '/api/v1/courses/'+str(course_num)+'/modules?include[]=items'
|
|
while t4: t4 = fetch(t4)
|
|
mods = results
|
|
results = []
|
|
i = 1
|
|
print("\nWhat page do you want to repair?")
|
|
for M in mods:
|
|
print("Module Name: " + M['name'])
|
|
for I in M['items']:
|
|
if I['type']=='Page':
|
|
modpagelist.append(I['title'])
|
|
modurllist.append(I['page_url'])
|
|
print(" " + str(i) + ". " + I['title'])
|
|
i += 1
|
|
|
|
choice = input("\n> ")
|
|
choice = int(choice) - 1
|
|
chosen_url = modurllist[choice]
|
|
print('Fetching: ' + modpagelist[choice])
|
|
t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/'+chosen_url
|
|
print('From: ' + t2)
|
|
|
|
results_dict = {}
|
|
while(t2): t2 = fetch(t2)
|
|
mypage = results_dict
|
|
fixed_page = checker.safe_html(mypage['body'])
|
|
upload_page(course_num,chosen_url,fixed_page)
|
|
|
|
# Given course, page url, and new content, upload the new revision of a page
|
|
def upload_page(course_num,pageurl,new_content):
|
|
print("Repaired page:\n\n")
|
|
#print new_content
|
|
print(pageurl)
|
|
t3 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + pageurl
|
|
xyz = input('Enter 1 to continue and send back to: ' + t3 + ': ')
|
|
#xyz = '1'
|
|
if xyz=='1':
|
|
data = {'wiki_page[body]':new_content}
|
|
r3 = requests.put(t3, headers=header, params=data)
|
|
print(r3)
|
|
print('ok')
|
|
|
|
# Use template to build html page with homegrown subtitles
|
|
def build_srt_embed_php(data):
|
|
template = codecs.open('template_srt_and_video.txt','r','utf-8').readlines()
|
|
result = ''
|
|
for L in template:
|
|
L = re.sub('FRAMEID',data['frameid'],L)
|
|
L = re.sub('TITLE',data['title'],L)
|
|
L = re.sub('EMBEDLINK',data['embedlink'],L)
|
|
L = re.sub('SRTFOLDERFILE',data['srtfolderfile'],L)
|
|
result += L
|
|
return result
|
|
|
|
|
|
|
|
|
|
def yt_title(code):
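    """Return the title of a YouTube video, scraping the watch page on a cache
    miss and persisting results in saved_titles between runs."""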
|
|
global saved_titles
|
|
if code in saved_titles:
|
|
return saved_titles[code]
|
|
a = requests.get('https://www.youtube.com/watch?v=%s' % code)
|
|
bbb = bs(a.content,"lxml")
|
|
ccc = bbb.find('title').text
|
|
ccc = re.sub(r'\s\-\sYouTube','',ccc)
|
|
saved_titles[code] = ccc
|
|
    codecs.open('cache/saved_youtube_titles.json','w','utf-8').write(json.dumps(saved_titles))
|
|
return ccc
|
|
|
|
def swap_youtube_subtitles():
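    """Interactively match local .srt files (under video_srt/) to the YouTube
    videos embedded in a course's pages, build a PHP embed page per match from
    template_srt_and_video.txt, and hand the folder to put_file()."""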
|
|
# example here: http://siloor.github.io/youtube.external.subtitle/examples/srt/
|
|
|
|
# srt folder, look at all filenames
|
|
srtlist = os.listdir('video_srt')
|
|
i = 0
|
|
for V in srtlist:
|
|
print(str(i) + '. ' + V)
|
|
i += 1
|
|
choice = input("Which SRT folder? ")
|
|
choice = srtlist[int(choice)]
|
|
srt_folder = 'video_srt/'+choice
|
|
class_srt_folder = choice
|
|
srt_files = os.listdir(srt_folder)
|
|
srt_shorts = {}
|
|
print("\nThese are the subtitle files: " + str(srt_files))
|
|
for V in srt_files:
|
|
if V.endswith('srt'):
|
|
V1 = re.sub(r'(\.\w+$)','',V)
|
|
srt_shorts[V] = minimal_string(V1)
|
|
|
|
crs_id = input("What is the id of the course? ")
|
|
grab_course_pages(crs_id)
|
|
v1_pages = codecs.open('page_revisions/course_'+str(crs_id)+'.html','r','utf-8')
|
|
v1_content = v1_pages.read()
|
|
|
|
# a temporary page of all youtube links
|
|
tp = codecs.open('page_revisions/links_' + str(crs_id) + '.html', 'w','utf-8')
|
|
|
|
# course pages, get them all and look for youtube embeds
|
|
title_shorts = {}
|
|
title_embedlink = {}
|
|
title_list = []
|
|
print("I'm looking for iframes and youtube links.")
|
|
for L in v1_content.split('\n'):
|
|
if re.search('<a.*?href="https:\/\/youtu',L):
|
|
print("Possibly there's a linked video instead of embedded:" + L)
|
|
if re.search('iframe',L):
|
|
ma = re.compile('(\w+)=(".*?")')
|
|
#print "\n"
|
|
this_title = ''
|
|
for g in ma.findall(L):
|
|
print(g)
|
|
if g[0]=='title':
|
|
this_title = g[1].replace('"','')
|
|
if g[0]=='src':
|
|
this_src = g[1].replace('"','')
|
|
#print g
|
|
if not this_title:
|
|
tmp = re.search(r'embed\/(.*?)\?',this_src)
|
|
if not tmp: tmp = re.search(r'embed\/(.*?)$',this_src)
|
|
if tmp:
|
|
this_title = yt_title(tmp.groups()[0])
|
|
title_shorts[this_title] = minimal_string(this_title)
|
|
title_list.append(this_title)
|
|
title_embedlink[this_title] = this_src
|
|
            print("%s\n" % this_title.encode('ascii', 'ignore').decode('ascii'))
|
|
tp.write( "%s<br><a target='_blank' href='%s'>%s</a><br /><br />" % (this_title, this_src, this_src) )
|
|
# match them
|
|
# lowercase, non alpha or num chars become a single space, try to match
|
|
# if any srts remain unmatched, ask.
|
|
tp.close()
|
|
webbrowser.open_new_tab('file://C:/SCRIPTS/everything-json/page_revisions/links_'+str(crs_id)+'.html')
|
|
|
|
matches = {} # key is Title, value is srt file
|
|
for S,v in list(srt_shorts.items()):
|
|
found_match = 0
|
|
print(v, end=' ')
|
|
for T, Tv in list(title_shorts.items()):
|
|
if v == Tv:
|
|
print(' \tMatches: ' + T, end=' ')
|
|
found_match = 1
|
|
matches[T] = S
|
|
break
|
|
#print "\n"
|
|
|
|
print("\nThese are the srt files: ")
|
|
print(json.dumps(srt_shorts,indent=2))
|
|
print("\nThese are the titles: ")
|
|
print(json.dumps(title_shorts,indent=2))
|
|
print("\nThese are the matches: ")
|
|
print(json.dumps(matches,indent=2))
|
|
|
|
print(("There are %d SRT files and %d VIDEOS found. " % ( len(list(srt_shorts.keys())), len(list(title_shorts.keys())) ) ))
|
|
|
|
for S,v in list(srt_shorts.items()):
|
|
if not S in list(matches.values()):
|
|
print("\nDidn't find a match for: " + S)
|
|
i = 0
|
|
for T in title_list:
|
|
                if not T in list(matches.keys()): print(str(i+1) + ". " + T.encode('ascii', 'ignore').decode('ascii'))
|
|
i += 1
|
|
print("Here's the first few lines of the SRT:")
|
|
print(( re.sub(r'\s+',' ', '\n'.join(open(srt_folder+"/"+S,'r').readlines()[0:10]))+"\n\n"))
|
|
choice = input("Which one should I match it to? (zero for no match) ")
|
|
if int(choice)>0:
|
|
matches[ title_list[ int(choice)-1 ] ] = S
|
|
print("SRT clean name was: %s, and TITLE clean name was: %s" % (v,title_shorts[title_list[ int(choice)-1 ]] ))
|
|
print("ok, here are the matches:")
|
|
print(json.dumps(matches,indent=2))
|
|
|
|
# construct subsidiary pages, upload them
|
|
i = 0
|
|
for m,v in list(matches.items()):
|
|
# open template
|
|
# do replacement
|
|
i += 1
|
|
data = {'frameid':'videoframe'+str(i), 'title':m, 'embedlink':title_embedlink[m], 'srtfolderfile':v }
|
|
print(json.dumps(data,indent=2))
|
|
file_part = v.split('.')[0]
|
|
new_php = codecs.open(srt_folder + '/' + file_part + '.php','w','utf-8')
|
|
new_php.write(build_srt_embed_php(data))
|
|
new_php.close()
|
|
#srt_files = os.listdir(srt_folder)
|
|
put_file(class_srt_folder)
|
|
|
|
|
|
def test_swap():
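    """Quick check of the iframe/title extraction used by
    swap_youtube_subtitles(), run against a saved dump of course 6923."""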
|
|
crs_id = '6923'
|
|
# swap in embed code and re-upload canvas pages
|
|
v2_pages = codecs.open('page_revisions/course_'+str(crs_id)+'.html','r','utf-8')
|
|
v2_content = v2_pages.read()
|
|
ma = re.compile('(\w+)=(".*?")')
|
|
|
|
for L in v2_content.split('\n'):
|
|
find = re.findall('<iframe(.*?)>',L)
|
|
if find:
|
|
print("Found: ", find)
|
|
for each in find:
|
|
#print "\n"
|
|
this_title = ''
|
|
this_src = ''
|
|
for g in ma.findall(each):
|
|
#print g
|
|
if g[0]=='title':
|
|
this_title = g[1].replace('"','')
|
|
if g[0]=='src':
|
|
this_src = g[1].replace('"','')
|
|
#print g
|
|
if not this_title:
|
|
tmp = re.search(r'embed\/(.*?)\?',this_src)
|
|
if not tmp: tmp = re.search(r'embed\/(.*?)$',this_src)
|
|
if tmp:
|
|
this_title = yt_title(tmp.groups()[0])
|
|
                print("Found embed link: %s\n and title: %s\n" % (this_src, this_title.encode('ascii', 'ignore').decode('ascii')))
|
|
|
|
|
|
def multiple_downloads():
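    """Run accessible_check() once for each id in a space-separated list."""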
|
|
|
|
x = input("What IDs? Separate with one space: ")
|
|
for id in x.split(" "):
|
|
accessible_check(id)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
print ('')
|
|
options = { 1: ['download a class into a folder / word file', accessible_check] ,
|
|
2: ['download multiple classes', multiple_downloads ],
|
|
3: ['convert stuff', pan_testing ],
|
|
4: ['convert md to html', md_to_course ],
|
|
5: ['import freshdesk content', freshdesk ],
|
|
                6: ["download all of a course's pages", grab_course_pages],
|
|
}
|
|
|
|
for key in options:
|
|
print(str(key) + '.\t' + options[key][0])
|
|
|
|
print('')
|
|
resp = input('Choose: ')
|
|
|
|
# Call the function in the options dict
|
|
options[ int(resp)][1]()