course downloading

parent 84f0a97529 · commit 1c3f9dbf10

content.py · 256 changed lines
@@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util

h = HTMLParser()

pagebreak = '\n\n<!-- BREAK -->\n\n'

DBG = 1

def d(s):
    global DBG
    if DBG: print(s)


# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
-def accessible_check(id=""):
+def test_forums(id=0):
    if not id:
        id = input("ID of course to check? ")
-   pagebreak = '\n\n<!-- BREAK -->\n\n'
    verbose = 1

    save_file_types = ['application/pdf', 'application/docx', 'image/jpg', 'image/png', 'image/gif', 'image/webp', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    courseinfo = fetch('/api/v1/courses/' + str(id), verbose)

    item_id_to_index = {}
@@ -42,6 +36,144 @@ def accessible_check(id=""):

    items = []
    for x in range(9000): items.append(0)

    for m in modules:
        items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
        running_index += 1

        mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/' + str(m['id']) + '/items', verbose)

        for I in mod_items:

            if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl'] or 'content_id' in I:
                running_index += 1

                if I['type'] == 'SubHeader':
                    #print('subheader: ' + str(I))
                    items[running_index] = '<h3>%s</h3>\n' % str(json.dumps(I, indent=2))

                if I['type'] == 'Page':
                    item_id_to_index[ I['page_url'] ] = running_index

                if I['type'] == 'Quiz':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'Discussion':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'ExternalUrl':
                    items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])

                # ?
                #if 'content_id' in I:
                #    item_id_to_index[ I['content_id'] ] = running_index
            else:
                print("What is this item? " + str(I))

                #items_inorder.append('<i>Not included: ' + I['title'] + ' (a ' + I['type'] + ')</i>\n\n\n' )

    # I['title']
    # I['content_id']
    # I['page_url']
    # I['type']
    # I['published']
    # assignments and files have content_id, pages have page_url
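    # A hedged illustration of the two shapes handled above (field subset
    # inferred from this code, not full Canvas API output; values invented):
    #   {'type': 'Page', 'title': 'Syllabus', 'page_url': 'syllabus', 'published': True}
    #   {'type': 'Quiz', 'title': 'Week 1 Quiz', 'content_id': 12345, 'published': True}
    # Pages key the reverse lookup by page_url; quizzes, discussions,
    # assignments and files key it by content_id.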
    course_folder = '../course_temps/course_' + str(id)
    index = []
    try:
        os.mkdir(course_folder)
    except FileExistsError:
        print("Course folder exists.")

    index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
    print(json.dumps(index, indent=2))

def write_message(fd, view, participants):
    fd.write(f"<blockquote>\nfrom <b>{participants[view['user_id']]['display_name']}</b>:<br />\n{view['message']}\n<br />")
    if 'replies' in view:
        for r in view['replies']:
            write_message(fd, r, participants)
    fd.write("</blockquote>\n")

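# write_message renders one post, then recurses through its nested 'replies',
# so thread depth maps to <blockquote> nesting depth. A minimal sketch of the
# payload it expects (shape inferred from the field accesses above; names and
# values are illustrative, not verified Canvas output):
#
#   participants = {7: {'display_name': 'Ana'}, 9: {'display_name': 'Bo'}}
#   view = {'user_id': 7, 'message': '<p>Question?</p>',
#           'replies': [{'user_id': 9, 'message': '<p>Answer.</p>'}]}
#   write_message(fd, view, participants)   # emits nested blockquotes
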
def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0):
    ###
    ### FORUMS
    ###
    index = []
    forum_f = course_folder + '/forums'
    headered = 0
    image_count = 0
    print("\nFORUMS")
    try:
        os.mkdir(forum_f)
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
            title = t2['title']
            message = t2['message']

            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
            try:
                participants = {x['id']: x for x in t2['participants']}
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w', 'utf-8') as fd:
                    fd.write(f"<h1>{title}</h1>\n")
                    fd.write(message + "\n\n")
                    for v in t2['view']:
                        write_message(fd, v, participants)
                if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items_inorder[ item_id_to_index[ p['id'] ] ] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
                else:
                    print(" This forum didn't seem to be in the modules list.")
            except Exception as e:
                print("Error here:", e)
                #print p
                #print results_dict
    except Exception as e:
        print("** Forum folder seems to exist. Skipping those.")
        print(e)

    return index


# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def accessible_check(id=""):
    if not id:
        id = input("ID of course to check? ")
    verbose = 1

    save_file_types = ['application/pdf', 'application/docx', 'image/jpg', 'image/png', 'image/gif', 'image/webp', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    courseinfo = fetch('/api/v1/courses/' + str(id), verbose)

    # reverse lookup into items array
    item_id_to_index = {}

    # is it used?
    items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak, ]
    running_index = 1

    modules = fetch('/api/v1/courses/' + str(id) + '/modules', verbose)

    # headers / module names
    items = []
    for x in range(9000): items.append(0)

    video_link_list = []
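The fetch helper is not shown in this diff; Canvas list endpoints (modules, module items, discussion_topics) paginate via Link headers, so a wrapper in this spirit is one plausible shape for it (a sketch under that assumption, not this repo's implementation):

import requests

def fetch_all(url, headers):
    # follow the 'next' rel of the Link header until the list is exhausted
    results = []
    while url:
        r = requests.get(url, headers=headers)
        results.extend(r.json())
        url = r.links.get('next', {}).get('url')
    return results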
@@ -89,6 +221,8 @@ def accessible_check(id=""):

    # assignments and files have content_id, pages have page_url

    course_folder = '../course_temps/course_' + str(id)

    # list of each item, organized by item type. Tuples of (url, title)
    index = []
    try:
        os.mkdir(course_folder)
@@ -159,7 +293,7 @@ def accessible_check(id=""):

        if os.path.exists(this_page_filename):
            d(" - already downloaded %s" % this_page_filename)
-           this_page_content = open(this_page_filename, 'r').read()
+           this_page_content = codecs.open(this_page_filename, 'r', 'utf-8').read()
        elif re.search(r'eis-prod', p['url']) or re.search(r'gavilan\.ins', p['url']):
            d(' * skipping file behind passwords')
        else:
@@ -167,31 +301,35 @@ def accessible_check(id=""):

        if t2 and 'body' in t2 and t2['body']:
            bb = bs(t2['body'], features="lxml")
            a_links = bb.find_all('a')
-           for A in a_links:
-               if re.search( r'youtu', A['href']):
-                   video_link_list.append( (A['href'], A.text, 'pages/' + easier_filename + ".html") )
+           for A in a_links:
+               href = A.get('href')
+
+               if href and re.search( r'youtu', href):
+                   video_link_list.append( (A.get('href'), A.text, 'pages/' + easier_filename + ".html") )

            page_images = bb.find_all('img')
-           for I in page_images:
-               d(' - %s' % I['src'])
-               if re.search(r'eis-prod', I['src']) or re.search(r'gavilan\.ins', I['src']):
-                   d(' * skipping file behind passwords')
-               else:
-                   try:
-                       r = requests.get(I['src'], headers=header, stream=True)
-                       mytype = r.headers['content-type']
-                       #print("Response is type: " + str(mytype))
-                       r_parts = mytype.split("/")
-                       ending = r_parts[-1]
-
-                       with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
-                           for chunk in r.iter_content(chunk_size=128):
-                               fd.write(chunk)
-                       image_count += 1
-                   except Exception as e:
-                       d( ' * Error downloading page image, %s' % str(e) )
+           for I in page_images:
+               src = I.get('src')
+               if src:
+                   d(' - %s' % src)
+                   if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src):
+                       d(' * skipping file behind passwords')
+                   else:
+                       try:
+                           r = requests.get(src, headers=header, stream=True)
+                           mytype = r.headers['content-type']
+                           #print("Response is type: " + str(mytype))
+                           r_parts = mytype.split("/")
+                           ending = r_parts[-1]
+
+                           with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
+                               for chunk in r.iter_content(chunk_size=128):
+                                   fd.write(chunk)
+                           image_count += 1
+                       except Exception as e:
+                           d( ' * Error downloading page image, %s' % str(e) )

            try:
                with codecs.open(this_page_filename, 'w', 'utf-8') as fd:
                    this_page_content = "<h2>%s</h2>\n%s" % ( t2['title'], t2['body'] )
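The image pass above derives a file suffix from the response's content-type ('image/png' → 'png') and streams the body in chunks so large files never sit fully in memory. The same idea as a standalone sketch (function name and chunk size are arbitrary choices, not from this codebase):

import requests

def save_by_content_type(url, dest_stem, headers=None):
    r = requests.get(url, headers=headers, stream=True)
    ending = r.headers.get('content-type', 'application/octet-stream').split('/')[-1]
    path = '%s.%s' % (dest_stem, ending)
    with open(path, 'wb') as fh:
        for chunk in r.iter_content(chunk_size=8192):
            fh.write(chunk)
    return path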
@@ -249,63 +387,10 @@ def accessible_check(id=""):

    ###
    ### FORUMS
    ###
    """forum_f = course_folder + '/forums'
    headered = 0
    image_count = 0
    print("\nFORUMS")
    try:
        os.mkdir(forum_f)
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/' + str(forum_id), verbose)

            #### REMOVED
            bb = bs(t2['body'], features="lxml")
            print("IMAGES IN THIS PAGE")
            page_images = bb.find_all('img')
            for I in page_images:
                r = requests.get(I['src'], headers=header, stream=True)
                mytype = r.headers['content-type']
                print("Response is type: " + str(mytype))
                r_parts = mytype.split("/")
                ending = r_parts[-1]

                with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                image_count += 1
            #### END REMOVED

            try:
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w', 'utf-8') as fd:
                    fd.write("<h1>" + t2['title'] + "</h1>\n")
                    fd.write(t2['message'])
                if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>' + t2['title'] + '</h1>\n\n' + t2['message'] + '\n\n' + pagebreak
                else:
                    print(" This forum didn't seem to be in the modules list.")
            except Exception as e:
                print("Error here:", e)
                #print p
                #print results_dict
    except Exception as e:
        print("** Forum folder seems to exist. Skipping those.")
        print(e)



    index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )

    """
@@ -1286,6 +1371,7 @@ if __name__ == "__main__":

        2: ['download multiple classes', multiple_downloads ],
        3: ['convert stuff', pan_testing ],
        4: ['convert md to html', md_to_course ],
+       5: ['course download tester', test_forums ],
        # 5: ['import freshdesk content', freshdesk ],
        6: ["download all a course's pages", grab_course_pages],
        7: ['demo vector search', demo_vector_search],
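Each menu entry pairs a human-readable label with a handler; the driver loop lives outside this hunk, but dispatch presumably works along these lines (a sketch with assumed names, not code from this commit):

choice = int(input('Choice? '))
label, fn = menu[choice]   # e.g. 5 -> ['course download tester', test_forums]
fn()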