course downloading

This commit is contained in:
Coding with Peter 2024-04-17 07:25:10 -07:00
parent 84f0a97529
commit 1c3f9dbf10
1 changed file with 171 additions and 85 deletions

View File

@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util
h = HTMLParser()
pagebreak = '\n\n<!-- BREAK -->\n\n'
DBG = 1
def d(s):
    """Print *s* only when the module-level DBG flag is truthy."""
    # Reading a module-level name needs no `global` declaration.
    if DBG:
        print(s)
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def accessible_check(id=""):
def test_forums(id=0):
if not id:
id = input("ID of course to check? ")
pagebreak = '\n\n<!-- BREAK -->\n\n'
verbose = 1
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
item_id_to_index = {}
@ -43,6 +37,144 @@ def accessible_check(id=""):
items = []
for x in range(9000): items.append(0)
for m in modules:
items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
running_index += 1
mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
for I in mod_items:
if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
running_index += 1
if I['type'] == 'SubHeader':
#print('subheader: ' + str(I))
items[running_index] = '<h3>%s</h3>\n' % str(json.dumps(I,indent=2))
if I['type'] == 'Page':
item_id_to_index[ I['page_url'] ] = running_index
if I['type'] == 'Quiz':
item_id_to_index[ I['content_id'] ] = running_index
if I['type'] == 'Discussion':
item_id_to_index[ I['content_id'] ] = running_index
if I['type'] == 'ExternalUrl':
items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
# ?
#if 'content_id' in I:
# item_id_to_index[ I['content_id'] ] = running_index
else:
print("What is this item? " + str(I))
#items_inorder.append('<i>Not included: '+ I['title'] + '(a ' + I['type'] + ')</i>\n\n\n' )
# I['title']
# I['content_id']
# I['page_url']
# I['type']
# I['published']
# assignments and files have content_id, pages have page_url
course_folder = '../course_temps/course_'+id
index = []
try:
os.mkdir(course_folder)
except:
print("Course folder exists.")
index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
print(json.dumps(index,indent=2))
def write_message(fd, view, participants):
    """Render one discussion post and its nested replies as HTML blockquotes.

    fd           -- open text file-like object to write to
    view         -- dict with 'user_id', 'message', and an optional
                    'replies' list of the same shape (recursed into)
    participants -- maps user_id -> dict carrying a 'display_name' key
    """
    author = participants[view['user_id']]['display_name']
    fd.write(f"<blockquote>\nfrom <b>{author}</b>:<br />\n{view['message']}\n<br />")
    # Recurse into replies (absent key behaves like an empty list).
    for reply in view.get('replies', []):
        write_message(fd, reply, participants)
    fd.write("</blockquote>\n")
def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0):
    """Download every discussion forum of a course into <course_folder>/forums.

    Writes one HTML file per forum (title, opening message, then the
    threaded replies via write_message) and stores each forum's content
    into items_inorder at the slot given by item_id_to_index so the
    master document keeps module order.

    Returns a list for the course index: a section-header string followed
    by (relative_path, title) tuples; empty when the forums folder already
    exists (treated as "already downloaded").
    """
    ###
    ### FORUMS
    ###
    index = []
    forum_f = course_folder + '/forums'
    headered = 0
    print("\nFORUMS")
    # FIX: the old version wrapped the fetch and the whole loop in one
    # broad try/except, so ANY failure (network, KeyError, ...) printed
    # "folder seems to exist" and was swallowed. Only a pre-existing
    # folder should mean "skip"; other errors must surface.
    try:
        os.mkdir(forum_f)
    except FileExistsError:
        print("** Forum folder seems to exist. Skipping those.")
        return index
    forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
    for p in forums:
        p['title'] = clean_title(p['title'])
        forum_id = p['id']
        easier_filename = p['title']
        for a in 'title,posted_at,published'.split(','):
            print(str(p[a]), "\t", end=' ')
        print("")
        # First fetch: topic metadata and the opening message.
        t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
        title = t2['title']
        message = t2['message']
        # Second fetch: the full threaded view plus participants.
        t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
        try:
            participants = {x['id']: x for x in t2['participants']}
            with codecs.open(forum_f + '/' + easier_filename + '.html', 'w', 'utf-8') as fd:
                fd.write(f"<h1>{title}</h1>\n")
                fd.write(message + "\n\n")
                for v in t2['view']:
                    write_message(fd, v, participants)
            if not headered:
                index.append(('<br /><b>Discussion Forums</b><br />'))
            headered = 1
            index.append(('forums/' + easier_filename + '.html', p['title']))
            # write to running log of content in order of module
            if p['id'] in item_id_to_index:
                items_inorder[item_id_to_index[p['id']]] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
            else:
                print(' This forum didnt seem to be in the modules list.')
        except Exception as e:
            # A single bad forum (missing key, unwritable filename) must
            # not abort the remaining forums.
            print("Error here:", e)
    return index
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def accessible_check(id=""):
if not id:
id = input("ID of course to check? ")
verbose = 1
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
# reverse lookup into items array
item_id_to_index = {}
# is it used?
items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak,]
running_index = 1
modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)
# headers / module names
items = []
for x in range(9000): items.append(0)
video_link_list = []
for m in modules:
@ -89,6 +221,8 @@ def accessible_check(id=""):
# assignments and files have content_id, pages have page_url
course_folder = '../course_temps/course_'+id
# list of each item, organized by item type. Tuples of (url,title)
index = []
try:
os.mkdir(course_folder)
@ -159,7 +293,7 @@ def accessible_check(id=""):
if os.path.exists(this_page_filename):
d(" - already downloaded %s" % this_page_filename)
this_page_content = open(this_page_filename,'r').read()
this_page_content = codecs.open(this_page_filename,'r','utf-8').read()
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
d(' * skipping file behind passwords')
else:
@ -168,18 +302,22 @@ def accessible_check(id=""):
bb = bs(t2['body'],features="lxml")
a_links = bb.find_all('a')
for A in a_links:
if re.search( r'youtu', A['href']):
video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") )
href = A.get('href')
if href and re.search( r'youtu',href):
video_link_list.append( (A.get('href'), A.text, 'pages/'+easier_filename + ".html") )
page_images = bb.find_all('img')
for I in page_images:
d(' - %s' % I['src'])
if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']):
src = I.get('src')
if src:
d(' - %s' % src)
if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src):
d(' * skipping file behind passwords')
else:
try:
r = requests.get(I['src'],headers=header, stream=True)
r = requests.get(src,headers=header, stream=True)
mytype = r.headers['content-type']
#print("Response is type: " + str(mytype))
r_parts = mytype.split("/")
@ -249,63 +387,10 @@ def accessible_check(id=""):
###
### FORUMS
###
"""forum_f = course_folder + '/forums'
headered = 0
image_count = 0
print("\nFORUMS")
try:
os.mkdir(forum_f)
forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
for p in forums:
p['title'] = clean_title(p['title'])
forum_id = p['id']
easier_filename = p['title']
for a in 'title,posted_at,published'.split(','):
print(str(p[a]), "\t", end=' ')
print("")
t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
#### REMOVED
bb = bs(t2['body'],features="lxml")
print("IMAGES IN THIS PAGE")
page_images = bb.find_all('img')
for I in page_images:
r = requests.get(I['src'],headers=header, stream=True)
mytype = r.headers['content-type']
print("Response is type: " + str(mytype))
r_parts = mytype.split("/")
ending = r_parts[-1]
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
for chunk in r.iter_content(chunk_size=128):
fd.write(chunk)
image_count += 1
#### END REMOVED
try:
with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
fd.write("<h1>"+t2['title']+"</h1>\n")
fd.write(t2['message'])
if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
headered = 1
index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
# write to running log of content in order of module
if p['id'] in item_id_to_index:
items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
else:
print(' This forum didnt seem to be in the modules list.')
except Exception as e:
print("Error here:", e)
#print p
#print results_dict
except Exception as e:
print("** Forum folder seems to exist. Skipping those.")
print(e)
index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
"""
@ -1286,6 +1371,7 @@ if __name__ == "__main__":
2: ['download multiple classes', multiple_downloads ],
3: ['convert stuff', pan_testing ],
4: ['convert md to html', md_to_course ],
5: ['course download tester', test_forums ],
# 5: ['import freshdesk content', freshdesk ],
6: ['download all a courses pages', grab_course_pages],
7: ['demo vector search', demo_vector_search],