1571 lines
58 KiB
Python
1571 lines
58 KiB
Python
from __future__ import annotations
|
|
|
|
#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )
|
|
|
|
#from calendar import FRIDAY
|
|
#import html2markdown as h2m
|
|
|
|
from typing import ItemsView
|
|
import requests, codecs, os, re, json, sys, pypandoc
|
|
from checker import safe_html
|
|
from pipelines import header, fetch, url
|
|
from util import clean_title, to_file_friendly
|
|
from bs4 import BeautifulSoup as bs
|
|
from html.parser import HTMLParser
|
|
|
|
|
|
# HTML snippet inserted between sections of the combined course file; the
# comment marker and the CSS rule both request a page break.
pagebreak = (
    '\n\n<!-- BREAK -->\n\n'
    '<div style="page-break-before: always;"></div>\n\n'
)

# Truthy value turns on the debug printing done by d().
DBG = 1

# Ordered slots of HTML for the combined course file; rebuilt by the
# download functions below.
items = []
|
|
|
|
def d(s):
    """Print *s* only while the module-level ``DBG`` flag is truthy."""
    if not DBG:
        return
    print(s)
|
|
|
|
def test_forums(id=0):
    """Index a course's module items, then download only its forums.

    Builds the same slot list that ``course_download`` builds (one slot per
    module header and per tracked module item), publishes it through the
    module-level ``items`` list that ``extract_forums`` writes into, and
    prints the index entries returned by ``extract_forums``.

    Args:
        id: Course id (int or str). When falsy the user is prompted.
    """
    global items

    if not id:
        id = input("ID of course to check? ")
    id = str(id)  # the id is concatenated into paths and URLs below
    verbose = 1

    courseinfo = fetch('/api/v1/courses/' + id, verbose)
    modules = fetch('/api/v1/courses/' + id + '/modules', verbose)

    # reverse lookup: page url / content id -> slot number
    item_id_to_index = {}
    slots = {0: "<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak}
    next_slot = 1
    tracked_types = ('SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl')

    for m in modules:
        slots[next_slot] = '<h2>%s</h2>%s\n' % (m['name'], pagebreak)
        next_slot += 1

        mod_items = fetch('/api/v1/courses/' + id + '/modules/' + str(m['id']) + '/items', verbose)

        for I in mod_items:
            if I['type'] not in tracked_types and 'content_id' not in I:
                print("What is this item? " + str(I))
                continue

            # Reserve the slot first and advance afterwards, so the next
            # module header can never land on this item's slot.
            slot = next_slot
            next_slot += 1

            if I['type'] == 'SubHeader':
                slots[slot] = '<h3>%s</h3>\n' % str(json.dumps(I, indent=2))
            elif I['type'] == 'Page':
                item_id_to_index[I['page_url']] = slot
            elif I['type'] in ('Quiz', 'Discussion'):
                item_id_to_index[I['content_id']] = slot
            elif I['type'] == 'ExternalUrl':
                slots[slot] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])

    # extract_forums() fills the module-level list, so publish it there.
    items = [slots.get(i, 0) for i in range(next_slot)]

    course_folder = '../course_temps/course_' + id
    os.makedirs(course_folder, exist_ok=True)

    index = []
    index.extend(extract_forums(id, course_folder, item_id_to_index, verbose))
    print(json.dumps(index, indent=2))
|
|
|
|
def write_message(fd, view, participants):
    """Write one discussion entry and, recursively, its replies as HTML.

    Each entry becomes a ``<blockquote>``; replies are nested inside the
    blockquote of the entry they answer.

    Args:
        fd: Writable text stream.
        view: Entry dict; ``user_id``, ``message`` and ``replies`` are read
            when present.
        participants: Mapping of user id to a dict with ``display_name``.
    """
    # Entries may arrive without an author or a message (e.g. entries
    # flagged as deleted), so use placeholders instead of raising KeyError.
    author = participants.get(view.get('user_id'), {}).get('display_name', '(unknown user)')
    message = view.get('message')
    if message is None:
        message = '(deleted)' if view.get('deleted') else ''

    fd.write(f"<blockquote>\nfrom <b>{author}</b>:<br />\n{message}\n<br />")
    for reply in view.get('replies') or []:
        write_message(fd, reply, participants)
    fd.write("</blockquote>\n")
|
|
|
|
def extract_forums(id, course_folder, item_id_to_index, verbose=0):
    """Download every discussion topic of a course into ``<course_folder>/forums``.

    Each topic is written to its own HTML file (topic message followed by
    the threaded replies). Topics whose id appears in *item_id_to_index*
    are also copied into the matching slot of the module-level ``items``
    list.

    If the forums folder cannot be created (typically because it already
    exists from an earlier run) nothing is downloaded.

    Args:
        id: Course id.
        course_folder: Local folder of the course download.
        item_id_to_index: Mapping of module content id to slot in ``items``.
        verbose: Passed through to ``fetch``.

    Returns:
        Index entries: a header string followed by ``(relative_path, title)``
        tuples, or an empty list when nothing was downloaded.
    """
    global items

    index = []
    forum_f = course_folder + '/forums'
    print("\nFORUMS")

    # Only the mkdir failure means "already downloaded"; other failures
    # are reported separately below.
    try:
        os.mkdir(forum_f)
    except OSError as e:
        print("** Forum folder seems to exist. Skipping those.")
        print(e)
        return index

    try:
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
    except Exception as e:
        print("** Could not fetch the list of forums.")
        print(e)
        return index

    headered = 0
    for p in forums or []:
        # One broken topic must not stop the remaining ones.
        try:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")

            topic = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
            title = topic['title']
            message = topic['message']

            full_view = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
            participants = {x['id']: x for x in full_view['participants']}

            with codecs.open(forum_f + '/' + easier_filename + '.html', 'w', 'utf-8') as fd:
                fd.write(f"<h1>{title}</h1>\n")
                fd.write(message + "\n\n")
                for v in full_view['view']:
                    write_message(fd, v, participants)

            if not headered:
                index.append('<br /><b>Discussion Forums</b><br />')
                headered = 1
            index.append(('forums/' + easier_filename + '.html', p['title']))

            # write to running log of content in order of module
            if p['id'] in item_id_to_index:
                items[item_id_to_index[p['id']]] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
            else:
                print(' This forum didnt seem to be in the modules list.')
        except Exception as e:
            print("Error here:", e)

    return index
|
|
|
|
|
|
|
|
#
|
|
#
|
|
#
|
|
#
|
|
#
|
|
# todo: include front page.
|
|
# todo: clean html
|
|
# todo: toc
|
|
#
|
|
#
|
|
# Download everything interesting in a course to a local folder
|
|
# Build a master file with the entire class content
|
|
def _save_response(response, path):
    """Stream the body of a ``requests`` response into the file at *path*."""
    with open(path, 'wb') as fd:
        for chunk in response.iter_content(chunk_size=8192):
            fd.write(chunk)


def _localize_page_images(soup_infolder, soup_in_main, pages_f, name_prefix):
    """Download a page's images and point both soups at the local copies.

    *soup_infolder* gets paths relative to the pages folder and
    *soup_in_main* gets paths relative to the course folder. An image that
    cannot be downloaded keeps its original ``src`` in both soups.
    """
    local_names = {}
    for img in soup_infolder.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        if src in local_names:  # same image used twice on the page
            img['src'] = local_names[src]
            continue
        d(' - %s' % src)
        try:
            r = requests.get(src, headers=header, stream=True)
            r.raise_for_status()
            # "image/jpeg; charset=..." -> "jpeg"
            ending = r.headers['content-type'].split(';')[0].split('/')[-1].strip()
            if ending == 'jpeg':
                ending = "jpg"
            # Prefix with the page name so a later run never overwrites
            # the images of pages that were cached by an earlier run.
            local_src = f"{name_prefix}_{len(local_names)}.{ending}"
            _save_response(r, f"{pages_f}/{local_src}")
        except Exception as e:
            d(' * Error downloading page image, %s' % str(e))
            continue
        img['src'] = local_src
        local_names[src] = local_src

    for img in soup_in_main.find_all('img'):
        src = img.get('src')
        if src in local_names:
            img['src'] = 'pages/' + local_names[src]


def _log_studio_videos(soup, page_url, videos_log):
    """Record the Studio (instructuremedia) iframes and video sources of a page."""
    for iframe in soup.find_all("iframe"):
        src = iframe.get("src")
        if not src:
            continue

        # "custom_arc_media_id%3D<id>" up to the next '&' marks a Studio embed
        if re.search(r"custom_arc_media_id%3D([^&]+)", src):
            videos_log.write(f"page: {page_url} iframe src: {src}\n")
            videos_log.flush()

        if not re.search(r'instructuremedia\.com', src):
            continue
        try:
            iframe_response = requests.get(src)
        except requests.RequestException:
            print(f"Failed to retrieve iframe content from: {src}")
            continue
        if iframe_response.status_code != 200:
            print(f"Failed to retrieve iframe content from: {src}")
            continue
        videos_log.write(f"succesfully fetched {src}\n")
        videos_log.flush()

        video_tag = bs(iframe_response.text, 'html.parser').find('video')
        if not video_tag:
            continue
        for source_tag in video_tag.find_all('source'):
            print("Video Source found:", source_tag.get('src'))
            videos_log.write(f"page: {page_url} video src: {source_tag.get('src')}\n")
            videos_log.flush()


def _relink_cached_page(cached_html):
    """Parse a previously saved page and make its image paths course-relative.

    Saved pages reference their images relative to the pages folder; the
    combined course file lives one level up, so relative ``src`` values get
    a ``pages/`` prefix. Absolute URLs and rooted paths are left alone.
    """
    soup = bs(cached_html, features="lxml")
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and not re.match(r'(?:[a-zA-Z][a-zA-Z0-9+.-]*:|/)', src):
            img['src'] = 'pages/' + src
    return soup


# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def course_download(id=""):
    """Download a course's files, pages, assignments and forums locally.

    Everything is stored under ``../course_temps/course_<id>``: one
    sub-folder per content type, an ``index.html`` linking every saved
    item, and ``fullcourse.raw.html`` holding the content in module order,
    which is then converted with pandoc to html, markdown and docx. A dump
    of the internal tables goes to ``cache/coursedump.txt`` and Studio
    video findings to ``cache/accessible_check_log.txt``.

    Items that already exist on disk are not downloaded again.

    Args:
        id: Course id (int or str). When falsy the user is prompted.
    """
    import pathlib

    global items

    if not id:
        id = input("ID of course to check? ")
    id = str(id)  # the id is concatenated into paths and URLs below

    verbose = 0
    pages_only = 0

    # 'image/jpeg' is the registered type; 'image/jpg' is kept for servers
    # that report the non-standard spelling.
    save_file_types = ['application/pdf', 'application/docx', 'image/jpg', 'image/jpeg',
                       'image/png', 'image/gif', 'image/webp',
                       'application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    api_base = '/api/v1/courses/' + id
    courseinfo = fetch(api_base, verbose)
    modules = fetch(api_base + '/modules', verbose)

    # reverse lookup into items array: page url / content id -> slot number
    item_id_to_index = {}
    # Assignments use their own key space so their ids cannot collide with
    # quiz or discussion ids (and the keys stay JSON-serialisable).
    assignment_key = 'assignment_%s'

    # headers / module names
    slots = {0: f"<h1>{courseinfo['name']}</h1>\n{pagebreak}"}
    next_slot = 1
    tracked_types = ('SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl')
    video_link_list = []

    for m in modules:
        slots[next_slot] = '<h2>%s</h2>%s\n' % (m['name'], pagebreak)
        next_slot += 1

        mod_items = fetch(api_base + '/modules/' + str(m['id']) + '/items', verbose)

        for I in mod_items:
            # assignments and files have content_id, pages have page_url
            if I['type'] not in tracked_types and 'content_id' not in I:
                print("What is this item? " + str(I))
                continue

            # Reserve the slot first and advance afterwards, so the next
            # module header can never land on this item's slot.
            slot = next_slot
            next_slot += 1

            if I['type'] == 'SubHeader':
                slots[slot] = f"<h3>{I['title']}</h3>\n"
            elif I['type'] == 'Page':
                item_id_to_index[I['page_url']] = slot
            elif I['type'] in ('Quiz', 'Discussion'):
                item_id_to_index[I['content_id']] = slot
            elif I['type'] == 'Assignment':
                item_id_to_index[assignment_key % I['content_id']] = slot
            elif I['type'] == 'ExternalUrl':
                slots[slot] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])

    # Sized to what was actually indexed; unfilled slots stay 0 and are
    # skipped when the combined file is written.
    items = [slots.get(i, 0) for i in range(next_slot)]

    course_folder = '../course_temps/course_' + id
    os.makedirs(course_folder, exist_ok=True)

    # list of each item, organized by item type. Tuples of (url,title)
    index = []

    ###
    ### FILES
    ###
    if not pages_only:
        files_f = course_folder + '/files'
        print("\nFILES")
        os.makedirs(files_f, exist_ok=True)

        files = fetch(api_base + '/files', verbose)
        print("LISTING COURSE FILES")
        headered = 0
        for f in files:
            f['size'] = str(int(f['size']) / 1000) + 'k'
            if f['content-type'] not in save_file_types:
                continue
            d(' - %s' % f['filename'])

            local_path = files_f + '/' + f['filename']
            if os.path.exists(local_path):
                d(" - already downloaded %s" % local_path)
            else:
                try:
                    r = requests.get(f['url'], headers=header, stream=True)
                    _save_response(r, local_path)
                except (requests.RequestException, OSError) as e:
                    d(' * Problem downloading file, %s' % str(e))
                    continue

            if not headered:
                index.append('<br /><b>Files</b><br />')
                headered = 1
            index.append(('files/' + f['filename'], f['filename']))

    ###
    ### PAGES
    ###
    pages_f = course_folder + '/pages'
    print("\nPAGES")
    os.makedirs(pages_f, exist_ok=True)

    pages = fetch(api_base + '/pages', verbose)
    headered = 0
    with codecs.open('cache/accessible_check_log.txt', 'w', 'utf-8') as videos_log:
        for p in pages:
            d(' - %s' % p['title'])

            p['title'] = clean_title(p['title'])
            easier_filename = clean_title(p['url'])
            this_page_filename = "%s/%s.html" % (pages_f, easier_filename)

            if not headered:
                index.append('<br /><b>Pages</b><br />')
                headered = 1
            index.append(('pages/' + easier_filename + '.html', p['title']))

            # HTML of this page for the combined file; stays None when the
            # page could not be obtained, so nothing stale is ever written.
            main_html = None
            link_soup = None

            if os.path.exists(this_page_filename):
                d(" - already downloaded %s" % this_page_filename)
                with codecs.open(this_page_filename, 'r', 'utf-8') as fd:
                    link_soup = _relink_cached_page(fd.read())
                main_html = link_soup.prettify()
            else:
                t2 = fetch(api_base + '/pages/' + p['url'], verbose)
                if t2 and 'body' in t2 and t2['body']:
                    # Two copies: one with image paths relative to the
                    # pages folder, one relative to the course folder.
                    soup_infolder = bs(t2['body'], features="lxml")
                    soup_in_main = bs(t2['body'], features="lxml")
                    link_soup = soup_infolder

                    _localize_page_images(soup_infolder, soup_in_main, pages_f, easier_filename)
                    _log_studio_videos(soup_infolder, p['url'], videos_log)

                    heading = f"<h2>{t2.get('title', p['title'])}</h2>\n"
                    main_html = heading + soup_in_main.prettify()

                    # WRITE out page
                    try:
                        with codecs.open(this_page_filename, 'w', 'utf-8') as fd:
                            fd.write(heading + soup_infolder.prettify())
                    except OSError:
                        d(' * problem writing page content')
                    ## TODO include linked pages even if they aren't in module
                else:
                    d(' * nothing returned or bad fetch')

            if link_soup is not None:
                for A in link_soup.find_all('a'):
                    href = A.get('href')
                    if href and re.search(r'youtu', href):
                        video_link_list.append((href, A.text, 'pages/' + easier_filename + ".html"))

            # write to running log of content in order of module
            if p['url'] in item_id_to_index:
                if main_html is not None:
                    items[item_id_to_index[p['url']]] = f"{main_html}\n{pagebreak}"
            else:
                d(' -- This page didnt seem to be in the modules list.')

    if not pages_only:
        ###
        ### ASSIGNMENTS
        ###
        asm_f = course_folder + '/assignments'
        print("\nASSIGNMENTS")
        os.makedirs(asm_f, exist_ok=True)

        asm = fetch(api_base + '/assignments', verbose)
        headered = 0
        for p in asm:
            d(' - %s' % p['name'])
            try:
                relative_name = str(p['id']) + "_" + to_file_friendly(p['name']) + '.html'
                this_assmt_filename = asm_f + '/' + relative_name
                if os.path.exists(this_assmt_filename):
                    d(" - already downloaded %s" % this_assmt_filename)
                    # files are written as utf-8, so read them back as utf-8
                    with codecs.open(this_assmt_filename, 'r', 'utf-8') as fd:
                        this_assmt_content = fd.read()
                else:
                    t2 = fetch(api_base + '/assignments/' + str(p['id']), verbose)
                    # a missing description must not be rendered as "None"
                    this_assmt_content = "<h2>%s</h2>\n%s\n\n" % (t2['name'], t2['description'] or '')
                    with codecs.open(this_assmt_filename, 'w', 'utf-8') as fd:
                        fd.write(this_assmt_content)

                if not headered:
                    index.append('<br /><b>Assignments</b><br />')
                    headered = 1
                index.append(('assignments/' + relative_name, p['name']))

                # write to running log of content in order of module
                slot = item_id_to_index.get(assignment_key % p['id'])
                if slot is not None:
                    items[slot] = this_assmt_content + '\n\n' + pagebreak
            except Exception as e:
                d(' * Problem %s' % str(e))

        ###
        ### FORUMS
        ###
        index.extend(extract_forums(id, course_folder, item_id_to_index, verbose))

        # TODO: quizzes, and an "External Links" index section built from
        # the ExternalUrl module items.

    # Create index page of all gathered items
    with codecs.open(course_folder + '/index.html', 'w', 'utf-8') as myindex:
        for entry in index:
            if isinstance(entry, str):  # section header
                myindex.write(entry)
            else:  # (url, title)
                myindex.write("<a href='" + entry[0] + "'>" + entry[1] + "</a><br />\n")

    # Full course content in single file
    print("Writing main course files...")
    # Relative links in the combined file resolve against the course folder.
    base_href = pathlib.Path(course_folder).resolve().as_uri() + '/'
    with codecs.open(course_folder + '/fullcourse.raw.html', 'w', 'utf-8') as mycourse:
        mycourse.write(f"<html><head><base href='{base_href}'></head><body>\n")
        for chunk in items:
            if chunk:
                mycourse.write(chunk)

        # The video table belongs inside <body>, so it precedes the closing tags.
        if video_link_list:
            mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
            for link, txt, pg in video_link_list:
                mycourse.write("<tr><td><a target='_blank' href='" + link + "'>" + txt + "</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
            mycourse.write("</table>\n")
        mycourse.write("\n</body></html>")

    with open('cache/coursedump.txt', 'w', encoding='utf-8') as temp:
        temp.write("items: " + json.dumps(items, indent=2))
        temp.write("\n\n\n")
        temp.write("index: " + json.dumps(index, indent=2))
        temp.write("\n\n\n")
        temp.write("item_id_to_index: " + json.dumps(item_id_to_index, indent=2))

    # (source file, pandoc target format, output file, label for the error message)
    conversions = [
        ('fullcourse.raw.html', 'html', 'fullcourse.html', 'html'),
        ('fullcourse.html', 'md', 'fullcourse.md', 'markdown'),
        ('fullcourse.html', 'docx', 'fullcourse.docx', 'doc'),
    ]
    for source, target_format, target, label in conversions:
        try:
            pypandoc.convert_file(course_folder + '/' + source, target_format,
                                  outputfile=course_folder + '/' + target)
        except Exception as e:
            print(f"couldn't create {label} fullcourse page: {e}")
|
|
|
|
|
|
def media_testing():
    """Fetch and print the media list of one hard-coded Studio user."""
    user_id = 285  # ksmith
    endpoint = f"https://gavilan.instructuremedia.com/api/public/v1/users/{user_id}/media"
    print(fetch(endpoint, verbose=1, media=1))
|
|
|
|
def pan_testing():
    """Convert one course's ``fullcourse.md`` back to HTML with pandoc."""
    folder = '../course_temps/course_6862'
    source_md = folder + '/fullcourse.md'
    target_html = folder + "/fullcourse.v2.html"
    pypandoc.convert_file(source_md, 'html', outputfile=target_html)
|
|
|
|
# Given course, page url, and new content, upload the new revision of a page
|
|
def create_page(course_num,new_title,new_content):
    """Create a new page in a course after an interactive confirmation.

    Args:
        course_num: Course id.
        new_title: Title of the new page.
        new_content: HTML body of the new page.
    """
    t3 = url + '/api/v1/courses/' + str(course_num) + '/pages'
    print("Creating page: %s" % new_title)
    xyz = input('type 1 to confirm: ')
    if xyz == '1':
        data = {'wiki_page[title]': new_title, 'wiki_page[body]': new_content}
        # Send the fields in the request body (as create_new_page does);
        # a whole page body does not fit reliably in the URL query string.
        r3 = requests.post(t3, headers=header, data=data)
        print(r3)
        print('ok')
|
|
|
|
|
|
def md_to_course():
    """Split a course markdown file at its top-level headings and upload each part.

    The markdown file of the hard-coded course is converted to HTML; every
    ``<h1>`` starts a new page whose title is the heading text and whose
    body is everything up to the next ``<h1>``. Each page is passed to
    ``upload_page``, which asks for confirmation before sending.
    """
    course_id = "11214"
    infile = 'cache/pages/course_%s.md' % course_id
    output = 'cache/pages/course_%s_fixed.html' % course_id
    pypandoc.convert_file(infile, 'html', format='md', outputfile=output)

    with codecs.open(output, 'r', 'utf-8') as fd:
        soup = bs(fd.read(), features="lxml")

    if soup.body is None:
        print("Nothing to upload: %s has no content" % output)
        return

    current_page = ""
    current_title = ""

    for child in soup.body.children:
        if child.name == "h1":
            if current_title:
                # A new heading closes the page collected so far.
                upload_page(course_id, current_title, current_page)
                current_page = ""
                current_title = child.get_text()
                print("Next page: %s" % current_title)
            else:
                current_title = child.get_text()
        elif hasattr(child, 'prettify'):
            current_page += child.prettify(formatter="html")
        else:
            # bare text between tags
            current_page += str(child)

    # Upload the last page; without any heading there is no page name to
    # upload to.
    if current_title:
        upload_page(course_id, current_title, current_page)
    print("Done")
|
|
|
|
|
|
# DL pages only
|
|
def grab_course_pages(course_num=-1):
    """Save all module pages of a course, in module order, as HTML and markdown.

    Writes ``cache/pages/course_<id>.html`` (cleaned HTML of each page,
    preceded by a divider line naming the page) and
    ``cache/pages/course_<id>.md`` (pandoc markdown of each page).

    Args:
        course_num: Course id (int or str). A negative number prompts the
            user for the id.
    """
    global results

    # Only numbers can be compared with 0; ids given as strings are used as is.
    if isinstance(course_num, int) and course_num < 0:
        course_num = input("Id of course? ")
    else:
        course_num = str(course_num)

    # We want things in the order of the modules
    t4 = url + '/api/v1/courses/' + str(course_num) + '/modules?include[]=items'
    results = fetch(t4)

    i = 1
    divider = "\n### "
    html_path = 'cache/pages/course_' + str(course_num) + '.html'
    md_path = 'cache/pages/course_' + str(course_num) + '.md'
    with codecs.open(html_path, 'w', 'utf-8') as pageout, \
            codecs.open(md_path, 'w', 'utf-8') as pageoutm:
        for M in results:
            print("Module Name: " + M['name'])
            for I in M['items']:
                if I['type'] != 'Page':
                    continue
                pageout.write(divider + I['title'] + '### ' + I['page_url'] + '\n')
                print(" " + str(i) + ". " + I['title'])
                t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + I['page_url']
                print('Getting: ' + t2)
                mypage = fetch(t2)
                body = mypage.get('body') if mypage else None
                if body:
                    fixed = safe_html(body)
                    if fixed:
                        markdown = pypandoc.convert_text("\n<h1>" + I['title'] + "</h1>\n" + body, 'md', format='html')
                        pageout.write(fixed + '\n')
                        pageoutm.write(markdown + '\n')
                else:
                    print(" * no body returned for this page")
                pageout.flush()
                i += 1
|
|
|
|
# Download, clean html, and reupload page
|
|
def update_page():
    """Let the user pick a module page of the hard-coded course, clean its HTML and re-upload it.

    Lists the pages in module order, asks which one to repair, runs its
    body through ``safe_html`` and hands the result to ``upload_page``
    (which asks for confirmation before sending).
    """
    course_num = '6862'

    # We want things in the order of the modules.
    # NOTE(review): fetch() is used the way the other functions in this
    # file use it (the return value is the decoded response) - confirm.
    t4 = url + '/api/v1/courses/' + str(course_num) + '/modules?include[]=items'
    mods = fetch(t4)

    modpagelist = []
    modurllist = []
    print("\nWhat page do you want to repair?")
    for M in mods:
        print("Module Name: " + M['name'])
        for I in M['items']:
            if I['type'] == 'Page':
                modpagelist.append(I['title'])
                modurllist.append(I['page_url'])
                print(" " + str(len(modurllist)) + ". " + I['title'])

    if not modurllist:
        print("No pages found in the modules.")
        return

    try:
        choice = int(input("\n> ")) - 1
    except ValueError:
        print("Not a number.")
        return
    if not 0 <= choice < len(modurllist):
        print("No page with that number.")
        return

    chosen_url = modurllist[choice]
    print('Fetching: ' + modpagelist[choice])
    t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + chosen_url
    print('From: ' + t2)

    mypage = fetch(t2)
    fixed_page = safe_html(mypage['body'])
    upload_page(course_num, chosen_url, fixed_page)
|
|
|
|
# given dict of file info (from files api), construct an img tag that works in a page
|
|
#def file_to_img_tag(f, alt, course, soup):
|
|
# #tag = f"<img id=\"\" src=\"https://ilearn.gavilan.edu/courses/{course}/files/{f['id']}/preview\" alt=\"{f['filename']}\" "
|
|
# #tag += f"data-api-endpoint=\"https://ilearn.gavilan.edu/api/v1/courses/{course}/files/{f['id']}\" data-api-returntype=\"File\" />"
|
|
# return T
|
|
|
|
|
|
def html_file_to_page(filename, course, tags):
    """Turn a local HTML file into the title and body of a course page.

    Every ``<img>`` whose file name is a key of *tags* is replaced by an
    image tag pointing at the uploaded course file. The modified document
    is also saved next to the original as ``<filename>_mod.html``.

    Args:
        filename: Path of the HTML file.
        course: Course id used in the generated image URLs.
        tags: Mapping of image file name to a file-info dict with an ``id``.

    Returns:
        ``{'title': ..., 'body': ...}``, or None when the file cannot be
        read or parsed.
    """
    try:
        with codecs.open(filename, 'r', 'utf-8') as fd:
            soup = bs(fd.read(), 'html.parser')
    except Exception as e:
        print(f"Exception on {filename}: {e}")
        return

    result = {'title': soup.title.text.strip() if soup.title else ''}

    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue  # nothing to match against the uploaded files
        alt = img.get('alt', src)
        orig_filename = os.path.basename(src)
        if orig_filename not in tags:
            print( f" couldn't find replacement image: {src} alt: {alt}")
            continue
        file_id = tags[orig_filename]['id']
        T = soup.new_tag(name='img', src=f"https://ilearn.gavilan.edu/courses/{course}/files/{file_id}/preview")
        T['id'] = file_id
        T['alt'] = alt
        T['data-api-endpoint'] = f"https://ilearn.gavilan.edu/api/v1/courses/{course}/files/{file_id}"
        T['data-api-returntype'] = "File"
        img.replace_with(T)
        print( f" replaced image: {src} alt: {alt}")

    with codecs.open(filename + "_mod.html", 'w', 'utf-8') as outfile:
        outfile.write(soup.prettify())

    result['body'] = ''.join(map(str, soup.body.contents)) if soup.body else ''
    return result
|
|
|
|
def create_new_page(course_id, title, body):
    """Create a page in a course and report the url it was given.

    Args:
        course_id: Course id.
        title: Page title.
        body: HTML body of the page.

    Returns:
        The decoded JSON response, or None when the response could not be
        decoded or does not contain a ``url``.
    """
    print(f"Creating page: {title}, length: {len(body)}")
    request = f"{url}/api/v1/courses/{course_id}/pages"
    print(request)
    data = { 'wiki_page[title]': title, 'wiki_page[body]': body }
    r3 = requests.post(request, headers=header, data=data)
    try:
        result = json.loads(r3.text)
        print( f" + ok: {result['url']}")
    except (ValueError, KeyError, TypeError):
        # not JSON, or JSON without the expected 'url' field
        print(" - problem creating page?")
        return None
    return result
|
|
|
|
# Given a folder full of html pages and their linked images, create Canvas PAGES of them
|
|
def make_pages_from_folder(folder='cache/csis6/', course = '20558',
                           filenames=('welcome.html', 'welcome2.html', 'welcome3.html'),
                           refresh_file_list=False):
    """Create course pages from local HTML files whose images are already uploaded.

    The list of course files comes from ``cache/csis6filelist.json``; with
    *refresh_file_list* it is fetched from the course first and the cache
    file rewritten. Each HTML file is converted with ``html_file_to_page``
    and created with ``create_new_page``.

    Args:
        folder: Folder holding the HTML files (with trailing slash).
        course: Course id.
        filenames: File names inside *folder* to process; None processes
            everything in the folder.
        refresh_file_list: Fetch the course file list instead of only
            reading the cached copy.
    """
    file_list_cache = 'cache/csis6filelist.json'
    if refresh_file_list:
        print("Fetching course files")
        files = fetch(f"{url}/api/v1/courses/{course}/files")
        with codecs.open(file_list_cache, 'w', 'utf-8') as tempfile:
            tempfile.write(json.dumps(files))
    else:
        with codecs.open(file_list_cache, 'r', 'utf-8') as tempfile:
            files = json.loads(tempfile.read())

    # uploaded images by file name, used to rewrite <img> tags
    tags = {f['filename']: f for f in files
            if f['filename'].lower().endswith(('.jpg', '.png'))}

    contents = os.listdir(folder) if filenames is None else list(filenames)
    print(contents)
    for f in contents:
        m = re.search(r'^(.*)\.(html?)$', f)
        if m:
            print(f"html file: {m.group(1)}, extension: {m.group(2)}")
            newpage = html_file_to_page(folder+f, course, tags)
            if newpage is None:
                continue  # unreadable file; html_file_to_page reported it
            create_new_page(course, newpage['title'], newpage['body'])
            continue
        m = re.search(r'^(.*)\.(.*)$', f)
        if m:
            print(f"other file: {m.group(1)}, extension: {m.group(2)}")
        else:
            print(f"unknown file: {f}")
|
|
|
|
|
|
|
|
|
|
# Given course, page url, and new content, upload the new revision of a page
|
|
def upload_page(course_num,pageurl,new_content):
    """Upload a new revision of a page after an interactive confirmation.

    Args:
        course_num: Course id.
        pageurl: Url (slug) of the page inside the course.
        new_content: New HTML body of the page.
    """
    print("Repaired page:\n\n")
    print(pageurl)
    t3 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + pageurl
    xyz = input('Enter 1 to continue and send back to: ' + t3 + ': ')
    if xyz == '1':
        data = {'wiki_page[body]': new_content}
        # Send the body in the request body, not the query string: a full
        # page does not fit reliably in a URL.
        r3 = requests.put(t3, headers=header, data=data)
        print(r3)
        print('ok')
|
|
|
|
|
|
|
|
def multiple_downloads():
    """Prompt for several course ids and download each course in turn."""
    x = input("What IDs? Separate with one space: ")
    # split() without an argument ignores repeated or surrounding spaces,
    # which would otherwise produce empty ids (and a prompt per empty id).
    for course_id in x.split():
        course_download(course_id)
|
|
|
|
|
|
|
|
def fetch_support_page():
    """Fetch the support hub page of the template course and print it.

    Browser address of the page:
    https://ilearn.gavilan.edu/courses/20850/pages/online-student-support-hub
    """
    course_num = 20850
    page_url = "online-student-support-hub"
    endpoint = f"{url}/api/v1/courses/{course_num}/pages/{page_url}"
    print('Getting: ' + endpoint)
    page = fetch(endpoint)
    print(json.dumps(page, indent=2))
    print(page['body'])
|
|
|
|
|
|
from courses import getCoursesInTerm
|
|
|
|
def clear_old_page(shell_id,page_name):
    """Delete every page of a course whose title equals *page_name*.

    Args:
        shell_id: Course id.
        page_name: Exact page title to look for.
    """
    # get all pages
    all_pages = fetch(f"{url}/api/v1/courses/{shell_id}/pages")
    matching = [pg for pg in all_pages if pg['title'] == page_name]
    for pg in matching:
        print(f"found a page named {page_name}. Deleting it.")
        target = f"{url}/api/v1/courses/{shell_id}/pages/{pg['page_id']}"
        outcome = requests.delete(target, headers=header)
        print(f"{outcome}")
|
|
|
|
def add_support_page_full_semester(term=289):
    """Add the support page to the courses of a term, confirming each one.

    For every course the user answers ``1`` to add the page, ``all`` to add
    it to this and all remaining courses without further questions, or
    anything else to skip the course.

    Args:
        term: Term id passed to ``getCoursesInTerm``.
    """
    print("Fetching list of all active courses")
    courses = getCoursesInTerm(term,0,0)

    confirm_each = True
    print("answer 'all' to do the rest without confirming")

    for course in courses:
        if confirm_each:
            answer = input(f"Type 1 <enter> to add support page to {course['id']} ({course['name']}) ")
            if answer == 'all':
                confirm_each = False
            elif answer != '1':
                continue
        create_support_page(course['id'])
|
|
|
|
def create_support_page(shell_id=18297): # 29):
    """Publish the "Online Student Support Hub" page in a course and link it first in the first module.

    An existing page of the same title is deleted first. The page body is
    read from ``cache/support_min.html``. When the course has no modules, a
    published "Welcome" module is created to hold the link.

    Args:
        shell_id: Course id.
    """
    title = "Online Student Support Hub"

    # clear one of same name first.
    clear_old_page(shell_id, title)

    # make new one
    t3 = f"{url}/api/v1/courses/{shell_id}/pages/online-student-support-hub"
    with codecs.open("cache/support_min.html","r","utf-8") as fd:
        new_content = fd.read()
    data = {'wiki_page[body]':new_content, 'wiki_page[title]':title, 'wiki_page[published]':"true"}
    # The page body goes in the request body; it does not fit reliably in
    # the URL query string.
    r3 = requests.put(t3, headers=header, data=data)

    try:
        response = r3.json()
        print(f"page id: {response['page_id']}")
        page_url = response['url']
    except Exception as e:
        # Without a created page there is nothing to link from a module.
        print(f"Exception: {e}")
        return
    print('Page Created')

    # list modules
    # GET /api/v1/courses/:course_id/modules
    modules = fetch(f"{url}/api/v1/courses/{shell_id}/modules")

    if len(modules) == 0:
        # what if there are no modules? create and publish one
        t6 = f"{url}/api/v1/courses/{shell_id}/modules/"
        mod_data = {'module[name]': 'Welcome', 'module[unlock_at]':"2024-01-01T06:00:00-08:00"}
        r6 = requests.post(t6, headers=header, params=mod_data)
        module_id = r6.json()['id']
        print(f"created module, id: {module_id}")

        # publish module
        t7 = f"{url}/api/v1/courses/{shell_id}/modules/{module_id}"
        requests.put(t7, headers=header, params={'module[published]':'true'})
    else:
        # Prefer the module at position 1; fall back to the first one
        # listed so the item is never posted to a module id of 0.
        first = next((M for M in modules if M['position'] == 1), modules[0])
        module_id = first['id']
        print(f"found first module 1: ({module_id}) {first['name']}")

    # create module item
    # POST /api/v1/courses/:course_id/modules/:module_id/items
    t5 = f"{url}/api/v1/courses/{shell_id}/modules/{module_id}/items"
    item_data = {'module_item[title]': title, 'module_item[type]': 'Page', 'module_item[page_url]': page_url, 'module_item[position]':1}
    requests.post(t5, headers=header, params=item_data)

    print('ok')
|
|
|
|
def list_modules_and_items(shell_id, verbose=0):
    """Return a course's modules, each including its items and their content details.

    Args:
        shell_id: Course id.
        verbose: When truthy, also print the result as indented JSON.
    """
    endpoint = f"{url}/api/v1/courses/{shell_id}/modules?include[]=items&include[]=content_details"
    modules = fetch(endpoint)
    if verbose:
        print(json.dumps(modules,indent=2))
    return modules
|
|
|
|
def check_modules_for_old_orientation():
    """List the courses of two terms that still contain the old support-services module.

    Prints every course as it is checked, the item titles of the old
    module where it is found, and finally the ids of all affected courses.
    """
    from util import contains_key_value, find_dict_with_key_value, extract_key_values

    old_module_name = 'Online Student Support Services - Summer & Fall 2024'
    flagged = []

    for term in [286, 287]: # wi25, sp25
        print("Fetching list of all active courses")
        for course in getCoursesInTerm(term,0,0):
            print(f"{course['id']} - {course['name']}")
            course_modules = list_modules_and_items(course['id'])

            if not contains_key_value(course_modules, 'name', old_module_name):
                continue
            old_mod = find_dict_with_key_value(course_modules, 'name', old_module_name)

            print(" this course has the old module")
            flagged.append(f"{course['id']}")
            for item_title in extract_key_values(old_mod, 'title'):
                print(f" {item_title}")

    print(f"\nCheck these course ids:")
    for course_id in flagged:
        print(course_id)
|
|
|
|
|
|
def repair_ezproxy_links():
    """Report the pages of the term whose body has a line mentioning ezproxy.

    Pages titled "Online Library Services" are skipped. For every other
    page with a match, the course, title and url are printed followed by
    the matching lines. Nothing is modified.
    """
    from localcache2 import pages_in_term

    # get all pages in term
    # columns: c.id, c.course_code, c.sis_source_id, wp.id as wp_id, wp.title, wp.url, c.name , wp.body
    for p in pages_in_term():
        course = p[1]
        title = p[4]
        page_url = p[5]
        body = p[7]

        # Rows without a text title or body cannot be searched; skip them
        # explicitly rather than swallowing the resulting TypeError.
        if not isinstance(title, str) or not isinstance(body, str):
            continue
        if re.search(r'Online Library Services', title):
            continue

        s = re.findall('\n.*ezproxy.*\n', body)
        if s:
            print(course, title, page_url)
            print(" ", s, "\n")
|
|
|
|
|
|
|
|
def download_web(start_url=None, xpath_expr='//*[@id="co_contentColumn"]',
                 out_dir="cache/content", max_pages=600, delay=0.65):
    """Crawl a site folder, extract one XPath fragment per page, merge to Markdown/Word.

    Starting at `start_url`, pages on the same scheme/host and in the same
    folder (or below) are fetched breadth-first.  From each HTML page the
    nodes matching `xpath_expr` are extracted, their images are downloaded
    into `<out_dir>/assets`, and the fragment is converted to Markdown with
    the `pandoc` executable (raw HTML is kept if pandoc is missing or fails).
    All sections are written to `<out_dir>/merged.md`, and pandoc is then
    asked to build `<out_dir>/merged.docx` with a table of contents.

    start_url  -- first page to fetch; defaults to the California Code of
                  Regulations browse page this tool was written for.
    xpath_expr -- XPath selecting the content nodes to keep from each page.
    out_dir    -- output folder (created if needed).
    max_pages  -- maximum number of fetch attempts (failed fetches count).
    delay      -- seconds to sleep after each successfully processed page.

    Called with no arguments it behaves like the original hard-coded run.
    Returns nothing; progress is printed to stdout.
    """
    import os, re, time, hashlib, mimetypes, subprocess
    from collections import deque
    from urllib.parse import urlsplit, urlunsplit, urljoin
    import posixpath as ppath
    import requests
    from lxml import html

    SESSION = requests.Session()
    SESSION.headers.update({
        "User-Agent": "MiniXPathCrawler/1.0 (+for personal archiving; contact admin if issues)"
    })

    def normalize_path(path: str) -> str:
        """Return `path` normalized with posixpath rules and a leading '/'."""
        normalized = ppath.normpath(path or "/")
        if not normalized.startswith("/"):
            normalized = "/" + normalized
        return normalized

    def base_dir_of(path: str) -> str:
        """Return the folder containing `path`, always ending in '/'."""
        # Ensure trailing slash for folder comparison
        if not path or path.endswith("/"):
            bd = path or "/"
        else:
            bd = ppath.dirname(path) + "/"
        bd = normalize_path(bd)
        if not bd.endswith("/"):
            bd += "/"
        return bd

    def canonical_url(u: str, drop_query=True) -> str:
        """Return `u` with a normalized path, no fragment and (optionally) no query."""
        sp = urlsplit(u)
        path = normalize_path(sp.path)
        if drop_query:
            sp = sp._replace(path=path, query="", fragment="")
        else:
            sp = sp._replace(path=path, fragment="")
        return urlunsplit(sp)

    def same_folder_or_below(start_url: str, link_url: str) -> bool:
        """True when `link_url` shares scheme/host with and lives under the start folder."""
        su = urlsplit(start_url)
        lu = urlsplit(link_url)
        if su.scheme != lu.scheme or su.netloc != lu.netloc:
            return False
        bd = base_dir_of(su.path)      # e.g., "/a/b/"
        tp = normalize_path(lu.path)   # e.g., "/a/b/page.html"
        return (tp == bd[:-1]) or tp.startswith(bd)

    def is_html_response(resp: requests.Response) -> bool:
        """True when the response's Content-Type mentions 'html'."""
        ctype = resp.headers.get("Content-Type", "")
        return "html" in ctype.lower()

    def fetch_html(page_url: str, timeout=20):
        """GET `page_url`; return (response, parsed document) or (None, None)."""
        try:
            r = SESSION.get(page_url, timeout=timeout, allow_redirects=True)
        except requests.RequestException:
            return None, None
        if r.status_code != 200 or not is_html_response(r):
            return None, None
        try:
            doc = html.fromstring(r.content)
        except Exception:
            # lxml raises several unrelated error types on unparsable input.
            return None, None
        # make links absolute for easier handling of images and hrefs
        doc.make_links_absolute(r.url)
        return r, doc

    def safe_filename_from_url(u: str, default_ext=".bin") -> str:
        """Return a short hash of `u` plus a best-effort file extension."""
        h = hashlib.sha1(u.encode("utf-8")).hexdigest()[:16]
        ext = ""
        path = urlsplit(u).path
        if "." in path:
            ext = "." + path.split(".")[-1].split("?")[0].split("#")[0]
            # Anything that does not look like a plain short extension is dropped.
            if not re.match(r"^\.[A-Za-z0-9]{1,5}$", ext):
                ext = ""
        return h + (ext or default_ext)

    def download_image(img_url: str, assets_dir: str) -> str | None:
        """Download `img_url` into `assets_dir`; return the local path or None."""
        try:
            r = SESSION.get(img_url, timeout=20, stream=True)
        except requests.RequestException:
            return None
        # A streamed response holds its connection until it is closed, so
        # close it on every exit path.
        try:
            if r.status_code != 200:
                return None
            # extension: prefer from Content-Type
            ext = None
            ctype = r.headers.get("Content-Type", "")
            if "/" in ctype:
                ext_guess = mimetypes.guess_extension(ctype.split(";")[0].strip())
                if ext_guess:
                    ext = ext_guess
            fname = safe_filename_from_url(img_url, default_ext=ext or ".img")
            os.makedirs(assets_dir, exist_ok=True)
            fpath = os.path.join(assets_dir, fname)
            try:
                with open(fpath, "wb") as f:
                    for chunk in r.iter_content(65536):
                        if chunk:
                            f.write(chunk)
            except (OSError, requests.RequestException):
                return None
            return fpath
        finally:
            r.close()

    def html_fragment_from_xpath(doc, xpath_expr: str, assets_dir: str):
        """Return (html_fragment, title) for the nodes matching `xpath_expr`.

        Returns (None, None) when nothing matches.  Images inside the
        fragment are downloaded and their `src` rewritten to 'assets/<file>'.
        """
        nodes = doc.xpath(xpath_expr)
        if not nodes:
            return None, None

        # Remove <script>/<style>/<noscript> inside nodes.  drop_tree() keeps
        # the text that follows the removed element; getparent().remove()
        # would silently delete that tail text along with it.
        for n in nodes:
            for bad in n.xpath(".//script|.//style|.//noscript"):
                bad.drop_tree()

        # Download images and rewrite src
        for n in nodes:
            for img in n.xpath(".//img[@src]"):
                src = img.get("src")
                if not src:
                    continue
                local = download_image(src, assets_dir)
                if local:
                    # Relative to the markdown file, which sits in the parent of assets/
                    rel = os.path.join("assets", os.path.basename(local)).replace("\\", "/")
                    img.set("src", rel)

        frag_html = "".join(html.tostring(n, encoding="unicode") for n in nodes)

        # Title from <title> or first heading in fragment
        doc_title = (doc.xpath("string(//title)") or "").strip()
        if not doc_title:
            h = html.fromstring(frag_html)
            t2 = (h.xpath("string(//h1)") or h.xpath("string(//h2)") or "").strip()
            doc_title = t2 or "Untitled"
        return frag_html, doc_title

    def html_to_markdown_with_pandoc(html_str: str) -> str:
        """Convert HTML to GitHub-flavored Markdown via pandoc; fall back to the raw HTML."""
        try:
            p = subprocess.run(
                ["pandoc", "-f", "html", "-t", "gfm"],
                input=html_str.encode("utf-8"),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            if p.returncode == 0:
                return p.stdout.decode("utf-8", errors="ignore")
            # fallback to raw HTML if conversion failed
            return html_str
        except FileNotFoundError:
            # pandoc missing; return raw HTML
            return html_str

    def build_docx_from_markdown(md_path: str, out_docx: str, resource_path: str):
        """Run pandoc to turn `md_path` into a .docx with a table of contents."""
        cmd = [
            "pandoc",
            "-s",
            md_path,
            "-o",
            out_docx,
            "--toc",
            "--toc-depth=3",
            f"--resource-path={resource_path}",
            "--from=markdown+raw_html",
        ]
        subprocess.run(cmd, check=True)

    def crawl(start_url: str, xpath_expr: str, out_dir: str, max_pages: int, delay: float):
        """Breadth-first crawl from `start_url`, then write merged.md / merged.docx."""
        os.makedirs(out_dir, exist_ok=True)
        assets_dir = os.path.join(out_dir, "assets")
        os.makedirs(assets_dir, exist_ok=True)

        # Dedup keys KEEP the query string: the target site distinguishes
        # pages only by ?guid=..., so dropping the query would collapse every
        # sibling page into the start page and nothing else would be crawled.
        seen = {canonical_url(start_url, drop_query=False)}
        q = deque([start_url])
        attempts = 0

        md_sections = []

        while q and attempts < max_pages:
            page_url = q.popleft()
            attempts += 1

            resp, doc = fetch_html(page_url)
            if doc is None:
                print(f"[skip] Non-HTML or fetch failed: {page_url}")
                continue
            # A redirect target counts as seen too.
            seen.add(canonical_url(resp.url, drop_query=False))

            # Extract and rewrite images for the chosen XPath fragment
            frag_html, title = html_fragment_from_xpath(doc, xpath_expr, assets_dir)
            if frag_html:
                md = html_to_markdown_with_pandoc(frag_html)
                section = f"# {title}\n\n_Source: {resp.url}_\n\n{md}\n"
                md_sections.append(section)
                print(f"[ok] {resp.url}")

            # Enqueue in-scope links (from the whole page)
            for a in doc.xpath("//a[@href]"):
                href = a.get("href")
                if not href:
                    continue
                absu = urljoin(resp.url, href)
                # Drop fragments for comparison/enqueue
                absu_nf = urlunsplit(urlsplit(absu)._replace(fragment=""))
                # Compare in the same canonical form the set stores, and
                # never queue the same page twice.
                key = canonical_url(absu_nf, drop_query=False)
                if key in seen:
                    continue
                if same_folder_or_below(start_url, absu_nf):
                    seen.add(key)
                    q.append(absu_nf)

            time.sleep(delay)

        merged_md = os.path.join(out_dir, "merged.md")
        with open(merged_md, "w", encoding="utf-8") as f:
            f.write("\n\n".join(md_sections))

        out_docx = os.path.join(out_dir, "merged.docx")
        try:
            build_docx_from_markdown(merged_md, out_docx, out_dir)
        except subprocess.CalledProcessError as e:
            print("[warn] pandoc failed to create .docx:", e)

        print(f"\nDone.\nMarkdown: {merged_md}\nWord: {out_docx}\nPages: {len(md_sections)} (in scope)")

    if start_url is None:
        start_url = "https://govt.westlaw.com/calregs/Browse/Home/California/CaliforniaCodeofRegulations?guid=I2A5DA5204C6911EC93A8000D3A7C4BC3&originationContext=documenttoc&transitionType=Default&contextData=(sc.Default)"

    crawl(start_url, xpath_expr, out_dir, max_pages, delay)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def flowgrid():
    """Render a lane/step "flow grid" diagram to cache/flow.html.

    A tiny line-oriented DSL (see `parse_spec`) describes lanes and the
    ordered steps inside each lane.  The built-in spec at the bottom of this
    function is parsed, rendered to a standalone HTML page and written to
    'cache/flow.html' (the folder is created if it does not exist).

    Takes no arguments and returns nothing.
    """
    from dataclasses import dataclass, field
    from pathlib import Path
    from typing import List, Optional, Dict
    import re
    import html

    # ---------------------- Data model ----------------------

    @dataclass
    class Step:
        code: str                    # shown as the step's bold title
        label: str                   # default text of the subline
        weeks: Optional[str] = None
        hours: Optional[str] = None
        tag: Optional[str] = None    # 'req' | 'rec' | None
        desc: Optional[str] = None   # override/extra text for subline
        klass: Optional[str] = None  # additional css class

    @dataclass
    class Lane:
        name: str
        steps: List[Step] = field(default_factory=list)

    @dataclass
    class Doc:
        title: str
        lanes: List[Lane] = field(default_factory=list)
        css_vars: Dict[str, str] = field(default_factory=dict)

    # ---------------------- Parser ----------------------

    def parse_spec(text: str) -> Doc:
        """Parse the flow-grid DSL into a Doc.

        DSL syntax:
          - Comments start with '#'
          - KEY: value   (supported keys: TITLE, VAR)
              VAR: --step=260px; --arrow=34px; --done=110px   (semicolon separated; optional)
          - LANE: <name>
              STEP: CODE | LABEL | weeks=2; hours=20; tag=req
              STEP: CODE | LABEL | desc=1 hour on-site; tag=rec
          - Empty lines are ignored.
          - Indentation is optional and only for readability.

        Raises ValueError for a STEP before any LANE, a STEP without both
        CODE and LABEL, or any line that matches none of the forms above.
        """
        title = "Untitled Diagram"
        lanes: List[Lane] = []
        current_lane: Optional[Lane] = None
        css_vars: Dict[str, str] = {}

        for raw in text.splitlines():
            line = raw.strip()
            if not line or line.startswith("#"):
                continue

            # KEY: value
            m = re.match(r'(?i)TITLE\s*:\s*(.+)$', line)
            if m:
                title = m.group(1).strip()
                continue

            # VAR line
            m = re.match(r'(?i)VAR\s*:\s*(.+)$', line)
            if m:
                # semicolon separated k=v; allow CSS custom props like --step=300px
                blob = m.group(1)
                parts = [p.strip() for p in blob.split(";") if p.strip()]
                for p in parts:
                    if "=" in p:
                        k, v = p.split("=", 1)
                        css_vars[k.strip()] = v.strip()
                continue

            # LANE
            m = re.match(r'(?i)LANE\s*:\s*(.+)$', line)
            if m:
                current_lane = Lane(name=m.group(1).strip())
                lanes.append(current_lane)
                continue

            # STEP
            m = re.match(r'(?i)STEP\s*:\s*(.+)$', line)
            if m:
                if current_lane is None:
                    raise ValueError("STEP appears before any LANE is defined.")
                body = m.group(1)
                # Expect: CODE | LABEL | attrs
                parts = [p.strip() for p in body.split("|")]
                if len(parts) < 2:
                    raise ValueError(f"STEP needs 'CODE | LABEL | ...' got: {body}")
                code = parts[0]
                label = parts[1]
                attrs_blob = parts[2] if len(parts) >= 3 else ""

                # Parse attrs: key=value; key=value
                step_kwargs = {}
                if attrs_blob:
                    for kv in [a.strip() for a in attrs_blob.split(";") if a.strip()]:
                        if "=" in kv:
                            k, v = kv.split("=", 1)
                            step_kwargs[k.strip().lower()] = v.strip()
                        else:
                            # allow bare tag 'req' or 'rec'
                            if kv.lower() in ("req", "rec"):
                                step_kwargs["tag"] = kv.lower()

                step = Step(
                    code=code,
                    label=label,
                    weeks=step_kwargs.get("weeks") or step_kwargs.get("w"),
                    hours=step_kwargs.get("hours") or step_kwargs.get("hrs") or step_kwargs.get("h"),
                    tag=normalize_tag(step_kwargs.get("tag")),
                    desc=step_kwargs.get("desc"),
                    klass=step_kwargs.get("class") or step_kwargs.get("klass"),
                )
                current_lane.steps.append(step)
                continue

            raise ValueError(f"Unrecognized line: {line}")

        return Doc(title=title, lanes=lanes, css_vars=css_vars)

    def normalize_tag(tag: Optional[str]) -> Optional[str]:
        """Map tag spellings to 'req' / 'rec' / None; unknown tags pass through lowercased."""
        if not tag:
            return None
        t = tag.lower().strip()
        if t in ("req", "required"):
            return "req"
        if t in ("rec", "recommended"):
            return "rec"
        if t in ("none", "na", "n/a", "optional"):
            return None
        return t

    # ---------------------- HTML rendering ----------------------

    BASE_CSS = r"""
:root{
--ink:#0f172a;
--reqBorder:#2e7d32; --reqFill:#eef7ef;
--recBorder:#8a8a8a; --recFill:#ffffff;
--doneBorder:#9ca3af; --doneInk:#475569;
--modeCol:180px; --gap:12px;
--step:260px; --arrow:34px; --done:110px;
}
html,body{margin:0;background:#f6f7fb;font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif;color:var(--ink)}
.wrap{margin:24px auto 48px;padding:0 16px}
h1{font-size:22px;margin:0 0 8px}
.legend{display:flex;gap:18px;align-items:center;font-size:14px;margin:6px 0 24px}
.tag{display:inline-block;padding:2px 8px;border-radius:999px;border:1.5px solid var(--reqBorder);background:var(--reqFill);font-size:12px}
.tag.rec{border-color:var(--recBorder);background:var(--recFill);border-style:dashed}
.grid{display:flex;flex-direction:column;gap:18px}
.lane{display:grid;grid-template-columns:var(--modeCol) 1fr;gap:var(--gap);align-items:center;background:#ffffffcc;padding:12px;border-radius:12px}
.mode{font-weight:700;text-align:center;background:#fff;padding:16px 10px}
.flow{display:grid;align-items:center;gap:8px;padding:8px 0;}
.header {grid-column: span 4; }
.step{border-radius:10px;padding:10px 12px;border:2px solid var(--reqBorder);background:var(--reqFill);min-height:64px}
.step .title{font-weight:700}
.step .sub{font-size:12px;opacity:.8}
.step.rec{border-color:var(--recBorder);border-style:dashed;background:var(--recFill)}
.slot{}
.arrow{font-size:22px;line-height:1;text-align:center}
.arrow.blank{color:transparent}
.done{justify-self:start;border-radius:999px;border:2px dashed var(--doneBorder);padding:10px 14px;color:var(--doneInk);background:#fff;text-align:center}
@media (max-width:900px){
.lane{grid-template-columns:1fr}
.mode{order:-1}
.flow{grid-template-columns:1fr; background:none}
.arrow{display:none}
}
"""

    def format_sub(step: Step) -> str:
        """Return the HTML subline of a step: description or label, duration, tag pill."""
        if step.desc:
            core = html.escape(step.desc)
        else:
            core = html.escape(step.label)
            weeks = html.escape(str(step.weeks)) if step.weeks else ""
            hours = html.escape(str(step.hours)) if step.hours else ""
            # e.g. " · 2 weeks (~20 hrs)", " · 2 weeks" or " · ~20 hrs"
            if weeks and hours:
                core += f" · {weeks} weeks (~{hours} hrs)"
            elif weeks:
                core += f" · {weeks} weeks"
            elif hours:
                core += f" · ~{hours} hrs"

        # Tag
        if step.tag == "req":
            tag_html = '<span class="tag">Required</span>'
        elif step.tag == "rec":
            tag_html = '<span class="tag rec">Recommended</span>'
        else:
            tag_html = ""
        if tag_html:
            return f'{core} · {tag_html}'
        return core

    def render_html(doc: Doc) -> str:
        """Render `doc` as one self-contained HTML page and return it as a string."""
        max_steps = max((len(l.steps) for l in doc.lanes), default=1)
        # grid-template-columns: repeat(max_steps, var(--step) var(--arrow)) var(--done)
        pairs = " ".join(["var(--step) var(--arrow)"] * max_steps) + " var(--done)"
        css_vars_block = ""
        if doc.css_vars:
            css_vars_block = ":root{\n" + "\n".join([f" {k}: {v};" for k, v in doc.css_vars.items()]) + "\n}\n"

        html_parts = []
        html_parts.append("<!DOCTYPE html><html><head><meta charset='utf-8'>")
        html_parts.append("<meta name='viewport' content='width=device-width, initial-scale=1'>")
        html_parts.append("<title>" + html.escape(doc.title) + "</title>")
        html_parts.append("<style>")
        html_parts.append(BASE_CSS)
        if css_vars_block:
            html_parts.append(css_vars_block)
        html_parts.append(f".flow{{grid-template-columns:{pairs};}}")
        html_parts.append("</style></head><body>")
        html_parts.append("<div class='wrap'><div class='grid'>")

        # Header/Title lane
        html_parts.append("<div class='lane'><div class='mode'> </div><div class='flow'><div class='header'><h1>")
        html_parts.append(html.escape(doc.title))
        html_parts.append("</h1></div></div></div>")

        for lane in doc.lanes:
            html_parts.append("<div class='lane'>")
            html_parts.append(f"<div class='mode'>{html.escape(lane.name)}</div>")
            html_parts.append("<div class='flow'>")
            for step in lane.steps:
                cls = "step"
                if step.tag == "rec":
                    cls += " rec"
                if step.klass:
                    cls += " " + html.escape(step.klass)
                html_parts.append(f"<div class='{cls}'>")
                html_parts.append(f"<div class='title'>{html.escape(step.code)}</div>")
                html_parts.append(f"<div class='sub'>{format_sub(step)}</div>")
                html_parts.append("</div>")  # step
                # An arrow follows EVERY step (the last one points at the
                # "Done" bubble); the grid template reserves a step+arrow
                # column pair per step.
                html_parts.append("<div class='arrow'>→</div>")
            # Fill remaining slots (if any) so "Done" lines up across lanes
            for _ in range(max_steps - len(lane.steps)):
                html_parts.append("<div class='slot'></div>")
                html_parts.append("<div class='arrow blank'>→</div>")

            # Done bubble
            html_parts.append("<div class='done'>Done</div>")
            html_parts.append("</div></div>")  # flow + lane

        html_parts.append("</div></div></body></html>")
        return "".join(html_parts)

    spec_text = '''
TITLE: Online Teaching Requirements and Recommendations
# Optional CSS overrides
VAR: --step=180px; --modeCol=180px

LANE: In Person (with Canvas)
STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=rec

LANE: Online
STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req

LANE: Hybrid
STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req
STEP: GOTT 5 | Essentials of Blended Learning | weeks=2; hours=20; tag=rec

LANE: Online Live
STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req
STEP: GOTT 6 | Introduction to Live Online Teaching and Learning | weeks=2; hours=20; tag=rec

LANE: HyFlex
STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req
STEP: GOTT 6 | Introduction to Live Online Teaching and Learning | weeks=2; hours=20; tag=rec
# You can override the subline using desc=
STEP: HyFlex Tech Training | ~1 hour on-site | desc=~1 hour on-site; tag=rec

'''
    doc = parse_spec(spec_text)
    out_html = render_html(doc)

    out_path = Path('cache/flow.html')
    # write_text() does not create missing folders, so make sure cache/ exists.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(out_html, encoding="utf-8")
    print("Wrote cache/flow.html")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Interactive menu: each entry maps a number to [description, function].
    # The number may be given as the first command-line argument; otherwise
    # the menu is printed and the user is asked to choose.

    print ('')
    options = { 1: ['download a class into a folder / word file', course_download] ,
                2: ['download multiple classes', multiple_downloads ],
                3: ['convert stuff', pan_testing ],
                4: ['convert md to html', md_to_course ],
                5: ['course download tester', test_forums ],
                6: ['download all a courses pages', grab_course_pages],
                7: ['quick site downloader', download_web],
                17: ['repair ezproxy links', repair_ezproxy_links],
                18: ['create pages from html files', make_pages_from_folder],
                19: ['fetch support page', fetch_support_page],
                20: ['create support page', create_support_page],
                21: ['add support page to all shells in semester', add_support_page_full_semester],
                22: ['fetch all modules / items', check_modules_for_old_orientation],
                30: ['media fetch', media_testing],
                40: ['flow grid', flowgrid],
                }

    # Require the WHOLE argument to be digits: a prefix test such as r'^\d+'
    # lets "7abc" through and int() then raises.
    from_argv = len(sys.argv) > 1 and re.fullmatch(r'\d+', sys.argv[1]) is not None

    if from_argv:
        resp = sys.argv[1]
    else:
        print ('')
        for key in options:
            print(str(key) + '.\t' + options[key][0])

        print('')
        resp = input('Choose: ')

    # Reject a non-numeric or unknown choice with a clear message instead of
    # a ValueError/KeyError traceback.
    try:
        chosen = options[int(resp)]
    except (ValueError, KeyError):
        sys.exit("Unknown option: %r" % (resp,))

    if from_argv:
        print("\n\nPerforming: %s\n\n" % chosen[0])

    # Call the function in the options dict
    chosen[1]()
|
|
|