From 1c3f9dbf104740073ba4fa1b61a683a2a22e6d7b Mon Sep 17 00:00:00 2001 From: Coding with Peter Date: Wed, 17 Apr 2024 07:25:10 -0700 Subject: [PATCH] course downloading --- content.py | 256 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 171 insertions(+), 85 deletions(-) diff --git a/content.py b/content.py index e7f96bb..2f43590 100644 --- a/content.py +++ b/content.py @@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util h = HTMLParser() +pagebreak = '\n\n\n\n' DBG = 1 def d(s): global DBG if DBG: print(s) - - -# Download everything interesting in a course to a local folder -# Build a master file with the entire class content -def accessible_check(id=""): +def test_forums(id=0): if not id: id = input("ID of course to check? ") - pagebreak = '\n\n\n\n' verbose = 1 - save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document'] - courseinfo = fetch('/api/v1/courses/' + str(id), verbose ) item_id_to_index = {} @@ -42,6 +36,144 @@ def accessible_check(id=""): items = [] for x in range(9000): items.append(0) + + for m in modules: + items[running_index] = '

%s

%s\n' % ( m['name'], pagebreak ) + running_index += 1 + + mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose) + + for I in mod_items: + + if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I: + running_index += 1 + + if I['type'] == 'SubHeader': + #print('subheader: ' + str(I)) + items[running_index] = '

%s

\n' % str(json.dumps(I,indent=2)) + + if I['type'] == 'Page': + item_id_to_index[ I['page_url'] ] = running_index + + if I['type'] == 'Quiz': + item_id_to_index[ I['content_id'] ] = running_index + + if I['type'] == 'Discussion': + item_id_to_index[ I['content_id'] ] = running_index + + if I['type'] == 'ExternalUrl': + items[running_index] = "%s
\n\n" % (I['external_url'], I['title']) + + # ? + #if 'content_id' in I: + # item_id_to_index[ I['content_id'] ] = running_index + else: + print("What is this item? " + str(I)) + + + #items_inorder.append('Not included: '+ I['title'] + '(a ' + I['type'] + ')\n\n\n' ) + + # I['title'] + # I['content_id'] + # I['page_url'] + # I['type'] + # I['published'] + # assignments and files have content_id, pages have page_url + + course_folder = '../course_temps/course_'+id + index = [] + try: + os.mkdir(course_folder) + except: + print("Course folder exists.") + + index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) ) + print(json.dumps(index,indent=2)) + +def write_message(fd, view, participants): + fd.write(f"
\nfrom {participants[view['user_id']]['display_name']}:
\n{view['message']}\n
") + if 'replies' in view: + for r in view['replies']: + write_message(fd, r, participants) + fd.write("
\n") + +def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0): + ### + ### FORUMS + ### + index = [] + forum_f = course_folder + '/forums' + headered = 0 + image_count = 0 + print("\nFORUMS") + try: + os.mkdir(forum_f) + forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose) + for p in forums: + p['title'] = clean_title(p['title']) + forum_id = p['id'] + easier_filename = p['title'] + for a in 'title,posted_at,published'.split(','): + print(str(p[a]), "\t", end=' ') + print("") + t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose) + title = t2['title'] + message = t2['message'] + + t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose) + try: + participants = {x['id']:x for x in t2['participants']} + with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd: + fd.write(f"

{title}

\n") + fd.write(message + "\n\n") + for v in t2['view']: + write_message(fd, v, participants) + if not headered: index.append( ('
Discussion Forums
') ) + headered = 1 + index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) ) + + + # write to running log of content in order of module + if p['id'] in item_id_to_index: + items_inorder[ item_id_to_index[ p['id'] ] ] = f"

{title}

\n\n{message}\n\n{pagebreak}" + else: + print(' This forum didnt seem to be in the modules list.') + except Exception as e: + print("Error here:", e) + #print p + #print results_dict + except Exception as e: + print("** Forum folder seems to exist. Skipping those.") + print(e) + + return index + + + + +# Download everything interesting in a course to a local folder +# Build a master file with the entire class content +def accessible_check(id=""): + if not id: + id = input("ID of course to check? ") + verbose = 1 + + save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document'] + + courseinfo = fetch('/api/v1/courses/' + str(id), verbose ) + + # reverse lookup into items array + item_id_to_index = {} + + # is it used? + items_inorder = ["" + courseinfo['name'] + "\n\n" + pagebreak,] + running_index = 1 + + modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose) + + # headers / module names + items = [] + for x in range(9000): items.append(0) video_link_list = [] @@ -89,6 +221,8 @@ def accessible_check(id=""): # assignments and files have content_id, pages have page_url course_folder = '../course_temps/course_'+id + + # list of each item, organized by item type. Tuples of (url,title) index = [] try: os.mkdir(course_folder) @@ -159,7 +293,7 @@ def accessible_check(id=""): if os.path.exists(this_page_filename): d(" - already downloaded %s" % this_page_filename) - this_page_content = open(this_page_filename,'r').read() + this_page_content = codecs.open(this_page_filename,'r','utf-8').read() elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']): d(' * skipping file behind passwords') else: @@ -167,31 +301,35 @@ def accessible_check(id=""): if t2 and 'body' in t2 and t2['body']: bb = bs(t2['body'],features="lxml") a_links = bb.find_all('a') - for A in a_links: - if re.search( r'youtu', A['href']): - video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") ) + for A in a_links: + href = A.get('href') + + if href and re.search( r'youtu',href): + video_link_list.append( (A.get('href'), A.text, 'pages/'+easier_filename + ".html") ) page_images = bb.find_all('img') for I in page_images: - d(' - %s' % I['src']) - if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']): - d(' * skipping file behind passwords') - else: - try: - r = requests.get(I['src'],headers=header, stream=True) - mytype = r.headers['content-type'] - #print("Response is type: " + str(mytype)) - r_parts = mytype.split("/") - ending = r_parts[-1] + src = I.get('src') + if src: + d(' - %s' % src) + if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src): + d(' * skipping file behind passwords') + else: + try: + r = requests.get(src,headers=header, stream=True) + mytype = r.headers['content-type'] + #print("Response is type: " + str(mytype)) + r_parts = mytype.split("/") + ending = r_parts[-1] - with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd: - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - image_count += 1 - except Exception as e: - d( ' * Error downloading page image, %s' % str(e) ) - + with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd: + for chunk in r.iter_content(chunk_size=128): + fd.write(chunk) + image_count += 1 + except Exception as e: + d( ' * Error downloading page image, %s' % str(e) ) + try: with codecs.open(this_page_filename, 'w','utf-8') as fd: this_page_content = "

%s

\n%s" % ( t2['title'], t2['body'] ) @@ -249,63 +387,10 @@ def accessible_check(id=""): ### ### FORUMS ### - """forum_f = course_folder + '/forums' - headered = 0 - image_count = 0 - print("\nFORUMS") - try: - os.mkdir(forum_f) - forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose) - for p in forums: - p['title'] = clean_title(p['title']) - forum_id = p['id'] - easier_filename = p['title'] - for a in 'title,posted_at,published'.split(','): - print(str(p[a]), "\t", end=' ') - print("") - t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose) - - - #### REMOVED - bb = bs(t2['body'],features="lxml") - print("IMAGES IN THIS PAGE") - page_images = bb.find_all('img') - for I in page_images: - r = requests.get(I['src'],headers=header, stream=True) - mytype = r.headers['content-type'] - print("Response is type: " + str(mytype)) - r_parts = mytype.split("/") - ending = r_parts[-1] - with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd: - for chunk in r.iter_content(chunk_size=128): - fd.write(chunk) - image_count += 1 - #### END REMOVED - - try: - with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd: - fd.write("

"+t2['title']+"

\n") - fd.write(t2['message']) - if not headered: index.append( ('
Discussion Forums
') ) - headered = 1 - index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) ) - - # write to running log of content in order of module - if p['id'] in item_id_to_index: - items_inorder[ item_id_to_index[ p['id'] ] ] = '

'+t2['title']+'

\n\n'+t2['message']+'\n\n'+pagebreak - else: - print(' This forum didnt seem to be in the modules list.') - except Exception as e: - print("Error here:", e) - #print p - #print results_dict - except Exception as e: - print("** Forum folder seems to exist. Skipping those.") - print(e) - - - + index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) ) + + """ @@ -1286,6 +1371,7 @@ if __name__ == "__main__": 2: ['download multiple classes', multiple_downloads ], 3: ['convert stuff', pan_testing ], 4: ['convert md to html', md_to_course ], + 5: ['course download tester', test_forums ], # 5: ['import freshdesk content', freshdesk ], 6: ['download all a courses pages', grab_course_pages], 7: ['demo vector search', demo_vector_search],