diff --git a/content.py b/content.py
index e7f96bb..2f43590 100644
--- a/content.py
+++ b/content.py
@@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util
h = HTMLParser()
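+# page separator used when stitching course items into one master document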
+pagebreak = '\n\n\n\n'
DBG = 1
def d(s):
global DBG
if DBG: print(s)
-
-
-# Download everything interesting in a course to a local folder
-# Build a master file with the entire class content
-def accessible_check(id=""):
+def test_forums(id=0):
if not id:
id = input("ID of course to check? ")
- pagebreak = '\n\n\n\n'
verbose = 1
- save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
-
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
item_id_to_index = {}
@@ -42,6 +36,144 @@ def accessible_check(id=""):
items = []
for x in range(9000): items.append(0)
+
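+    # one pass over the modules: each module contributes a header entry, then its items are indexed by position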
+ for m in modules:
+        items[running_index] = '<h1>%s</h1>%s\n' % ( m['name'], pagebreak )
+ running_index += 1
+
+ mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
+
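+        # record where each module item lands in the master document, keyed by page_url or content_id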
+ for I in mod_items:
+
+ if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
+ running_index += 1
+
+                if I['type'] == 'SubHeader':
+                    #print('subheader: ' + str(I))
+                    items[running_index] = '<h2>%s</h2>\n' % str(json.dumps(I,indent=2))
+
+                if I['type'] == 'Page':
+                    item_id_to_index[ I['page_url'] ] = running_index
+
+                if I['type'] == 'Quiz':
+                    item_id_to_index[ I['content_id'] ] = running_index
+
+                if I['type'] == 'Discussion':
+                    item_id_to_index[ I['content_id'] ] = running_index
+
+                if I['type'] == 'ExternalUrl':
+                    items[running_index] = "<a href='%s'>%s</a>\n\n" % (I['external_url'], I['title'])
+
+                # ?
+                #if 'content_id' in I:
+                #    item_id_to_index[ I['content_id'] ] = running_index
+            else:
+                print("What is this item? " + str(I))
+
+
+ #items_inorder.append('Not included: '+ I['title'] + '(a ' + I['type'] + ')\n\n\n' )
+
+ # I['title']
+ # I['content_id']
+ # I['page_url']
+ # I['type']
+ # I['published']
+ # assignments and files have content_id, pages have page_url
+
+    course_folder = '../course_temps/course_' + str(id)
+ index = []
+ try:
+ os.mkdir(course_folder)
+ except:
+ print("Course folder exists.")
+
+ index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
+ print(json.dumps(index,indent=2))
+
+def write_message(fd, view, participants):
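+    # recursively render one post: author line and body, then any replies nested inside a blockquote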
+ fd.write(f"\nfrom {participants[view['user_id']]['display_name']}:
\n{view['message']}\n
")
+ if 'replies' in view:
+ for r in view['replies']:
+ write_message(fd, r, participants)
+ fd.write("
\n")
+
+def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0):
+ ###
+ ### FORUMS
+ ###
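+    # download every discussion topic to <course_folder>/forums/, slot each body into
+    # items_inorder by module position, and return (url, title) pairs for the index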
+ index = []
+ forum_f = course_folder + '/forums'
+ headered = 0
+ image_count = 0
+ print("\nFORUMS")
+ try:
+ os.mkdir(forum_f)
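+        # mkdir doubles as a first-run guard: if the forums folder already exists, the except below skips the whole download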
+ forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
+ for p in forums:
+ p['title'] = clean_title(p['title'])
+ forum_id = p['id']
+ easier_filename = p['title']
+ for a in 'title,posted_at,published'.split(','):
+ print(str(p[a]), "\t", end=' ')
+ print("")
+ t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
+ title = t2['title']
+ message = t2['message']
+
+ t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
+ try:
+ participants = {x['id']:x for x in t2['participants']}
+ with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
+ fd.write(f"{title}
\n")
+ fd.write(message + "\n\n")
+ for v in t2['view']:
+ write_message(fd, v, participants)
+                    if not headered: index.append( ('<h2>Discussion Forums</h2>') )
+ headered = 1
+ index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
+
+
+ # write to running log of content in order of module
+ if p['id'] in item_id_to_index:
+                        items_inorder[ item_id_to_index[ p['id'] ] ] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
+ else:
+                        print(" This forum didn't seem to be in the modules list.")
+ except Exception as e:
+ print("Error here:", e)
+ #print p
+ #print results_dict
+ except Exception as e:
+ print("** Forum folder seems to exist. Skipping those.")
+ print(e)
+
+ return index
+
+
+
+
+# Download everything interesting in a course to a local folder
+# Build a master file with the entire class content
+def accessible_check(id=""):
+ if not id:
+ id = input("ID of course to check? ")
+ verbose = 1
+
+ save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
+
+ courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
+
+ # reverse lookup into items array
+ item_id_to_index = {}
+
+ # is it used?
+ items_inorder = ["" + courseinfo['name'] + "\n\n" + pagebreak,]
+ running_index = 1
+
+ modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)
+
+ # headers / module names
+ items = []
+ for x in range(9000): items.append(0)
video_link_list = []
@@ -89,6 +221,8 @@ def accessible_check(id=""):
# assignments and files have content_id, pages have page_url
course_folder = '../course_temps/course_'+id
+
+ # list of each item, organized by item type. Tuples of (url,title)
index = []
try:
os.mkdir(course_folder)
@@ -159,7 +293,7 @@ def accessible_check(id=""):
if os.path.exists(this_page_filename):
d(" - already downloaded %s" % this_page_filename)
- this_page_content = open(this_page_filename,'r').read()
+ this_page_content = codecs.open(this_page_filename,'r','utf-8').read()
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
d(' * skipping file behind passwords')
else:
@@ -167,31 +301,35 @@ def accessible_check(id=""):
if t2 and 'body' in t2 and t2['body']:
bb = bs(t2['body'],features="lxml")
a_links = bb.find_all('a')
- for A in a_links:
- if re.search( r'youtu', A['href']):
- video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") )
+ for A in a_links:
+ href = A.get('href')
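+                    # .get() returns None instead of raising KeyError when an <a> tag has no href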
+
+ if href and re.search( r'youtu',href):
+                        video_link_list.append( (href, A.text, 'pages/'+easier_filename + ".html") )
page_images = bb.find_all('img')
for I in page_images:
- d(' - %s' % I['src'])
- if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']):
- d(' * skipping file behind passwords')
- else:
- try:
- r = requests.get(I['src'],headers=header, stream=True)
- mytype = r.headers['content-type']
- #print("Response is type: " + str(mytype))
- r_parts = mytype.split("/")
- ending = r_parts[-1]
+ src = I.get('src')
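+                    # same guard for <img> tags that lack a src attribute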
+ if src:
+ d(' - %s' % src)
+ if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src):
+ d(' * skipping file behind passwords')
+ else:
+ try:
+ r = requests.get(src,headers=header, stream=True)
+ mytype = r.headers['content-type']
+ #print("Response is type: " + str(mytype))
+ r_parts = mytype.split("/")
+ ending = r_parts[-1]
- with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
- for chunk in r.iter_content(chunk_size=128):
- fd.write(chunk)
- image_count += 1
- except Exception as e:
- d( ' * Error downloading page image, %s' % str(e) )
-
+ with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
+ for chunk in r.iter_content(chunk_size=128):
+ fd.write(chunk)
+ image_count += 1
+ except Exception as e:
+ d( ' * Error downloading page image, %s' % str(e) )
+
try:
with codecs.open(this_page_filename, 'w','utf-8') as fd:
this_page_content = "%s
\n%s" % ( t2['title'], t2['body'] )
@@ -249,63 +387,10 @@ def accessible_check(id=""):
###
### FORUMS
###
- """forum_f = course_folder + '/forums'
- headered = 0
- image_count = 0
- print("\nFORUMS")
- try:
- os.mkdir(forum_f)
- forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
- for p in forums:
- p['title'] = clean_title(p['title'])
- forum_id = p['id']
- easier_filename = p['title']
- for a in 'title,posted_at,published'.split(','):
- print(str(p[a]), "\t", end=' ')
- print("")
- t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
-
-
- #### REMOVED
- bb = bs(t2['body'],features="lxml")
- print("IMAGES IN THIS PAGE")
- page_images = bb.find_all('img')
- for I in page_images:
- r = requests.get(I['src'],headers=header, stream=True)
- mytype = r.headers['content-type']
- print("Response is type: " + str(mytype))
- r_parts = mytype.split("/")
- ending = r_parts[-1]
- with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
- for chunk in r.iter_content(chunk_size=128):
- fd.write(chunk)
- image_count += 1
- #### END REMOVED
-
- try:
- with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
-            fd.write("<h1>"+t2['title']+"</h1>\n")
- fd.write(t2['message'])
-            if not headered: index.append( ('<h2>Discussion Forums</h2>') )
- headered = 1
- index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
-
- # write to running log of content in order of module
- if p['id'] in item_id_to_index:
-            items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
- else:
- print(' This forum didnt seem to be in the modules list.')
- except Exception as e:
- print("Error here:", e)
- #print p
- #print results_dict
- except Exception as e:
- print("** Forum folder seems to exist. Skipping those.")
- print(e)
-
-
-
+ index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
+
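+    # the string literal below keeps the rest of the old forum code (further down) commented out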
+ """
@@ -1286,6 +1371,7 @@ if __name__ == "__main__":
2: ['download multiple classes', multiple_downloads ],
3: ['convert stuff', pan_testing ],
4: ['convert md to html', md_to_course ],
+ 5: ['course download tester', test_forums ],
# 5: ['import freshdesk content', freshdesk ],
6: ['download all a courses pages', grab_course_pages],
7: ['demo vector search', demo_vector_search],