course downloading
This commit is contained in:
parent
84f0a97529
commit
1c3f9dbf10
224
content.py
224
content.py
|
|
@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util
|
||||||
|
|
||||||
h = HTMLParser()
|
h = HTMLParser()
|
||||||
|
|
||||||
|
# Separator inserted between sections when the per-course master document is
# assembled; rendered as an HTML comment so it survives markdown/HTML round trips.
pagebreak = '\n\n<!-- BREAK -->\n\n'

# Module-level debug switch: set to 0 to silence all d() output.
DBG = 1


def d(s):
    """Print *s* to stdout when the module-level DBG flag is truthy.

    Lightweight debug logger used throughout the downloader.  Reading a
    module global needs no ``global`` statement, so none is declared.
    """
    if DBG:
        print(s)
|
||||||
|
|
||||||
|
def test_forums(id=0):
|
||||||
|
|
||||||
# Download everything interesting in a course to a local folder
|
|
||||||
# Build a master file with the entire class content
|
|
||||||
def accessible_check(id=""):
|
|
||||||
if not id:
|
if not id:
|
||||||
id = input("ID of course to check? ")
|
id = input("ID of course to check? ")
|
||||||
pagebreak = '\n\n<!-- BREAK -->\n\n'
|
|
||||||
verbose = 1
|
verbose = 1
|
||||||
|
|
||||||
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
|
||||||
|
|
||||||
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
||||||
|
|
||||||
item_id_to_index = {}
|
item_id_to_index = {}
|
||||||
|
|
@ -43,6 +37,144 @@ def accessible_check(id=""):
|
||||||
items = []
|
items = []
|
||||||
for x in range(9000): items.append(0)
|
for x in range(9000): items.append(0)
|
||||||
|
|
||||||
|
for m in modules:
|
||||||
|
items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
|
||||||
|
running_index += 1
|
||||||
|
|
||||||
|
mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
|
||||||
|
|
||||||
|
for I in mod_items:
|
||||||
|
|
||||||
|
if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
|
||||||
|
running_index += 1
|
||||||
|
|
||||||
|
if I['type'] == 'SubHeader':
|
||||||
|
#print('subheader: ' + str(I))
|
||||||
|
items[running_index] = '<h3>%s</h3>\n' % str(json.dumps(I,indent=2))
|
||||||
|
|
||||||
|
if I['type'] == 'Page':
|
||||||
|
item_id_to_index[ I['page_url'] ] = running_index
|
||||||
|
|
||||||
|
if I['type'] == 'Quiz':
|
||||||
|
item_id_to_index[ I['content_id'] ] = running_index
|
||||||
|
|
||||||
|
if I['type'] == 'Discussion':
|
||||||
|
item_id_to_index[ I['content_id'] ] = running_index
|
||||||
|
|
||||||
|
if I['type'] == 'ExternalUrl':
|
||||||
|
items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
|
||||||
|
|
||||||
|
# ?
|
||||||
|
#if 'content_id' in I:
|
||||||
|
# item_id_to_index[ I['content_id'] ] = running_index
|
||||||
|
else:
|
||||||
|
print("What is this item? " + str(I))
|
||||||
|
|
||||||
|
|
||||||
|
#items_inorder.append('<i>Not included: '+ I['title'] + '(a ' + I['type'] + ')</i>\n\n\n' )
|
||||||
|
|
||||||
|
# I['title']
|
||||||
|
# I['content_id']
|
||||||
|
# I['page_url']
|
||||||
|
# I['type']
|
||||||
|
# I['published']
|
||||||
|
# assignments and files have content_id, pages have page_url
|
||||||
|
|
||||||
|
course_folder = '../course_temps/course_'+id
|
||||||
|
index = []
|
||||||
|
try:
|
||||||
|
os.mkdir(course_folder)
|
||||||
|
except:
|
||||||
|
print("Course folder exists.")
|
||||||
|
|
||||||
|
index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
|
||||||
|
print(json.dumps(index,indent=2))
|
||||||
|
|
||||||
|
def write_message(fd, view, participants):
    """Recursively render one discussion post (and its replies) as nested HTML.

    Each post becomes a ``<blockquote>`` naming the author and containing the
    message body; replies nest inside their parent's blockquote.

    fd           -- open, writable text file object
    view         -- one entry of the Canvas discussion ``view`` tree
                    (dict with 'user_id', 'message', optional 'replies')
    participants -- mapping of user id -> participant dict with 'display_name'

    Robustness fix: Canvas omits deleted/anonymous users from the
    participants list and deleted posts may lack a 'message', which
    previously raised KeyError and aborted the whole forum export.  Those
    now fall back to 'Unknown' / an empty body instead.
    """
    who = participants.get(view.get('user_id'), {}).get('display_name', 'Unknown')
    body = view.get('message') or ''
    fd.write(f"<blockquote>\nfrom <b>{who}</b>:<br />\n{body}\n<br />")
    if 'replies' in view:
        for r in view['replies']:
            write_message(fd, r, participants)
    fd.write("</blockquote>\n")
|
||||||
|
|
||||||
|
def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0):
    """Download a course's discussion forums to HTML files and index them.

    Fetches every discussion topic for course *id* via the Canvas API,
    writes each one (with its full reply tree) to
    ``<course_folder>/forums/<title>.html``, and slots the topic text into
    *items_inorder* at the position recorded in *item_id_to_index*.

    id               -- Canvas course id (str or int)
    course_folder    -- local folder for this course's downloads
    items_inorder    -- mutable list of content chunks, in module order;
                        updated in place for forums found in the modules list
    item_id_to_index -- maps Canvas content ids -> index into items_inorder
    verbose          -- passed through to fetch() for request logging

    Returns a list of index entries: a header string followed by
    (relative_path, title) tuples, for building the course index page.
    """
    ###
    ### FORUMS
    ###
    index = []
    forum_f = course_folder + '/forums'
    headered = 0          # have we emitted the "Discussion Forums" header yet?
    image_count = 0       # NOTE(review): never used here — presumably a leftover
    print("\nFORUMS")
    try:
        # mkdir is the first statement on purpose: if the forums folder
        # already exists, its OSError routes to the outer except and the
        # whole forum download is skipped (crude "already done" caching).
        os.mkdir(forum_f)
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            # one-line summary of this topic on the console
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            # first fetch: topic metadata (title + opening message)
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
            title = t2['title']
            message = t2['message']

            # second fetch: the full reply tree ("view") plus participants
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
            try:
                # user id -> participant record, for author name lookups
                participants = {x['id']:x for x in t2['participants']}
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
                    fd.write(f"<h1>{title}</h1>\n")
                    fd.write(message + "\n\n")
                    # write each top-level post; write_message recurses into replies
                    for v in t2['view']:
                        write_message(fd, v, participants)
                # emit the section header once, before the first forum entry
                if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items_inorder[ item_id_to_index[ p['id'] ] ] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
                else:
                    print(' This forum didnt seem to be in the modules list.')
            except Exception as e:
                # per-topic failure (bad data, unwritable filename): log and
                # continue with the next forum rather than aborting the run
                print("Error here:", e)
            #print p
            #print results_dict
    except Exception as e:
        # Reached when os.mkdir fails (folder exists) — but also swallows
        # fetch/API errors from the loop above, which then print the same
        # misleading message.  NOTE(review): consider narrowing this.
        print("** Forum folder seems to exist. Skipping those.")
        print(e)

    return index
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Download everything interesting in a course to a local folder
|
||||||
|
# Build a master file with the entire class content
|
||||||
|
def accessible_check(id=""):
|
||||||
|
if not id:
|
||||||
|
id = input("ID of course to check? ")
|
||||||
|
verbose = 1
|
||||||
|
|
||||||
|
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
||||||
|
|
||||||
|
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
||||||
|
|
||||||
|
# reverse lookup into items array
|
||||||
|
item_id_to_index = {}
|
||||||
|
|
||||||
|
# is it used?
|
||||||
|
items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak,]
|
||||||
|
running_index = 1
|
||||||
|
|
||||||
|
modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)
|
||||||
|
|
||||||
|
# headers / module names
|
||||||
|
items = []
|
||||||
|
for x in range(9000): items.append(0)
|
||||||
|
|
||||||
video_link_list = []
|
video_link_list = []
|
||||||
|
|
||||||
for m in modules:
|
for m in modules:
|
||||||
|
|
@ -89,6 +221,8 @@ def accessible_check(id=""):
|
||||||
# assignments and files have content_id, pages have page_url
|
# assignments and files have content_id, pages have page_url
|
||||||
|
|
||||||
course_folder = '../course_temps/course_'+id
|
course_folder = '../course_temps/course_'+id
|
||||||
|
|
||||||
|
# list of each item, organized by item type. Tuples of (url,title)
|
||||||
index = []
|
index = []
|
||||||
try:
|
try:
|
||||||
os.mkdir(course_folder)
|
os.mkdir(course_folder)
|
||||||
|
|
@ -159,7 +293,7 @@ def accessible_check(id=""):
|
||||||
|
|
||||||
if os.path.exists(this_page_filename):
|
if os.path.exists(this_page_filename):
|
||||||
d(" - already downloaded %s" % this_page_filename)
|
d(" - already downloaded %s" % this_page_filename)
|
||||||
this_page_content = open(this_page_filename,'r').read()
|
this_page_content = codecs.open(this_page_filename,'r','utf-8').read()
|
||||||
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
|
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
|
||||||
d(' * skipping file behind passwords')
|
d(' * skipping file behind passwords')
|
||||||
else:
|
else:
|
||||||
|
|
@ -168,18 +302,22 @@ def accessible_check(id=""):
|
||||||
bb = bs(t2['body'],features="lxml")
|
bb = bs(t2['body'],features="lxml")
|
||||||
a_links = bb.find_all('a')
|
a_links = bb.find_all('a')
|
||||||
for A in a_links:
|
for A in a_links:
|
||||||
if re.search( r'youtu', A['href']):
|
href = A.get('href')
|
||||||
video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") )
|
|
||||||
|
if href and re.search( r'youtu',href):
|
||||||
|
video_link_list.append( (A.get('href'), A.text, 'pages/'+easier_filename + ".html") )
|
||||||
|
|
||||||
|
|
||||||
page_images = bb.find_all('img')
|
page_images = bb.find_all('img')
|
||||||
for I in page_images:
|
for I in page_images:
|
||||||
d(' - %s' % I['src'])
|
src = I.get('src')
|
||||||
if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']):
|
if src:
|
||||||
|
d(' - %s' % src)
|
||||||
|
if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src):
|
||||||
d(' * skipping file behind passwords')
|
d(' * skipping file behind passwords')
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
r = requests.get(I['src'],headers=header, stream=True)
|
r = requests.get(src,headers=header, stream=True)
|
||||||
mytype = r.headers['content-type']
|
mytype = r.headers['content-type']
|
||||||
#print("Response is type: " + str(mytype))
|
#print("Response is type: " + str(mytype))
|
||||||
r_parts = mytype.split("/")
|
r_parts = mytype.split("/")
|
||||||
|
|
@ -249,63 +387,10 @@ def accessible_check(id=""):
|
||||||
###
|
###
|
||||||
### FORUMS
|
### FORUMS
|
||||||
###
|
###
|
||||||
"""forum_f = course_folder + '/forums'
|
|
||||||
headered = 0
|
|
||||||
image_count = 0
|
|
||||||
print("\nFORUMS")
|
|
||||||
try:
|
|
||||||
os.mkdir(forum_f)
|
|
||||||
forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
|
|
||||||
for p in forums:
|
|
||||||
p['title'] = clean_title(p['title'])
|
|
||||||
forum_id = p['id']
|
|
||||||
easier_filename = p['title']
|
|
||||||
for a in 'title,posted_at,published'.split(','):
|
|
||||||
print(str(p[a]), "\t", end=' ')
|
|
||||||
print("")
|
|
||||||
t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
|
|
||||||
|
|
||||||
|
|
||||||
#### REMOVED
|
|
||||||
bb = bs(t2['body'],features="lxml")
|
|
||||||
print("IMAGES IN THIS PAGE")
|
|
||||||
page_images = bb.find_all('img')
|
|
||||||
for I in page_images:
|
|
||||||
r = requests.get(I['src'],headers=header, stream=True)
|
|
||||||
mytype = r.headers['content-type']
|
|
||||||
print("Response is type: " + str(mytype))
|
|
||||||
r_parts = mytype.split("/")
|
|
||||||
ending = r_parts[-1]
|
|
||||||
|
|
||||||
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
|
|
||||||
for chunk in r.iter_content(chunk_size=128):
|
|
||||||
fd.write(chunk)
|
|
||||||
image_count += 1
|
|
||||||
#### END REMOVED
|
|
||||||
|
|
||||||
try:
|
|
||||||
with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
|
|
||||||
fd.write("<h1>"+t2['title']+"</h1>\n")
|
|
||||||
fd.write(t2['message'])
|
|
||||||
if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
|
|
||||||
headered = 1
|
|
||||||
index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
|
|
||||||
|
|
||||||
# write to running log of content in order of module
|
|
||||||
if p['id'] in item_id_to_index:
|
|
||||||
items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
|
|
||||||
else:
|
|
||||||
print(' This forum didnt seem to be in the modules list.')
|
|
||||||
except Exception as e:
|
|
||||||
print("Error here:", e)
|
|
||||||
#print p
|
|
||||||
#print results_dict
|
|
||||||
except Exception as e:
|
|
||||||
print("** Forum folder seems to exist. Skipping those.")
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
|
index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1286,6 +1371,7 @@ if __name__ == "__main__":
|
||||||
2: ['download multiple classes', multiple_downloads ],
|
2: ['download multiple classes', multiple_downloads ],
|
||||||
3: ['convert stuff', pan_testing ],
|
3: ['convert stuff', pan_testing ],
|
||||||
4: ['convert md to html', md_to_course ],
|
4: ['convert md to html', md_to_course ],
|
||||||
|
5: ['course download tester', test_forums ],
|
||||||
# 5: ['import freshdesk content', freshdesk ],
|
# 5: ['import freshdesk content', freshdesk ],
|
||||||
6: ['download all a courses pages', grab_course_pages],
|
6: ['download all a courses pages', grab_course_pages],
|
||||||
7: ['demo vector search', demo_vector_search],
|
7: ['demo vector search', demo_vector_search],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue