course downloading
This commit is contained in:
parent
84f0a97529
commit
1c3f9dbf10
224
content.py
224
content.py
|
|
@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util
|
||||||
|
|
||||||
h = HTMLParser()
|
h = HTMLParser()
|
||||||
|
|
||||||
|
# Separator inserted between sections when the per-course master document is
# assembled; rendered as an HTML comment so it survives markdown/HTML round trips.
pagebreak = '\n\n<!-- BREAK -->\n\n'

# Module-level debug switch: set to 0 to silence all d() output.
DBG = 1


def d(s):
    """Print *s* to stdout when the module-level DBG flag is truthy.

    Lightweight debug logger used throughout the downloader.  Reading a
    module global needs no ``global`` statement, so none is declared.
    """
    if DBG:
        print(s)
|
||||||
|
|
||||||
|
def test_forums(id=0):
|
||||||
|
|
||||||
# Download everything interesting in a course to a local folder
|
|
||||||
# Build a master file with the entire class content
|
|
||||||
def accessible_check(id=""):
|
|
||||||
if not id:
|
if not id:
|
||||||
id = input("ID of course to check? ")
|
id = input("ID of course to check? ")
|
||||||
pagebreak = '\n\n<!-- BREAK -->\n\n'
|
|
||||||
verbose = 1
|
verbose = 1
|
||||||
|
|
||||||
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
|
||||||
|
|
||||||
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
||||||
|
|
||||||
item_id_to_index = {}
|
item_id_to_index = {}
|
||||||
|
|
@ -43,6 +37,144 @@ def accessible_check(id=""):
|
||||||
items = []
|
items = []
|
||||||
for x in range(9000): items.append(0)
|
for x in range(9000): items.append(0)
|
||||||
|
|
||||||
|
for m in modules:
|
||||||
|
items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
|
||||||
|
running_index += 1
|
||||||
|
|
||||||
|
mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
|
||||||
|
|
||||||
|
for I in mod_items:
|
||||||
|
|
||||||
|
if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
|
||||||
|
running_index += 1
|
||||||
|
|
||||||
|
if I['type'] == 'SubHeader':
|
||||||
|
#print('subheader: ' + str(I))
|
||||||
|
items[running_index] = '<h3>%s</h3>\n' % str(json.dumps(I,indent=2))
|
||||||
|
|
||||||
|
if I['type'] == 'Page':
|
||||||
|
item_id_to_index[ I['page_url'] ] = running_index
|
||||||
|
|
||||||
|
if I['type'] == 'Quiz':
|
||||||
|
item_id_to_index[ I['content_id'] ] = running_index
|
||||||
|
|
||||||
|
if I['type'] == 'Discussion':
|
||||||
|
item_id_to_index[ I['content_id'] ] = running_index
|
||||||
|
|
||||||
|
if I['type'] == 'ExternalUrl':
|
||||||
|
items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
|
||||||
|
|
||||||
|
# ?
|
||||||
|
#if 'content_id' in I:
|
||||||
|
# item_id_to_index[ I['content_id'] ] = running_index
|
||||||
|
else:
|
||||||
|
print("What is this item? " + str(I))
|
||||||
|
|
||||||
|
|
||||||
|
#items_inorder.append('<i>Not included: '+ I['title'] + '(a ' + I['type'] + ')</i>\n\n\n' )
|
||||||
|
|
||||||
|
# I['title']
|
||||||
|
# I['content_id']
|
||||||
|
# I['page_url']
|
||||||
|
# I['type']
|
||||||
|
# I['published']
|
||||||
|
# assignments and files have content_id, pages have page_url
|
||||||
|
|
||||||
|
course_folder = '../course_temps/course_'+id
|
||||||
|
index = []
|
||||||
|
try:
|
||||||
|
os.mkdir(course_folder)
|
||||||
|
except:
|
||||||
|
print("Course folder exists.")
|
||||||
|
|
||||||
|
index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
|
||||||
|
print(json.dumps(index,indent=2))
|
||||||
|
|
||||||
|
def write_message(fd, view, participants):
    """Recursively render one discussion post (and its replies) as nested HTML.

    Each post becomes a ``<blockquote>`` naming the author and containing the
    message body; replies nest inside their parent's blockquote.

    fd           -- open, writable text file object
    view         -- one entry of the Canvas discussion ``view`` tree
                    (dict with 'user_id', 'message', optional 'replies')
    participants -- mapping of user id -> participant dict with 'display_name'

    Robustness fix: Canvas omits deleted/anonymous users from the
    participants list and deleted posts may lack a 'message', which
    previously raised KeyError and aborted the whole forum export.  Those
    now fall back to 'Unknown' / an empty body instead.
    """
    who = participants.get(view.get('user_id'), {}).get('display_name', 'Unknown')
    body = view.get('message') or ''
    fd.write(f"<blockquote>\nfrom <b>{who}</b>:<br />\n{body}\n<br />")
    if 'replies' in view:
        for r in view['replies']:
            write_message(fd, r, participants)
    fd.write("</blockquote>\n")
|
||||||
|
|
||||||
|
def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0):
    """Download a course's discussion forums to HTML files and index them.

    Fetches every discussion topic for course *id* via the Canvas API,
    writes each one (with its full reply tree) to
    ``<course_folder>/forums/<title>.html``, and slots the topic text into
    *items_inorder* at the position recorded in *item_id_to_index*.

    id               -- Canvas course id (str or int)
    course_folder    -- local folder for this course's downloads
    items_inorder    -- mutable list of content chunks, in module order;
                        updated in place for forums found in the modules list
    item_id_to_index -- maps Canvas content ids -> index into items_inorder
    verbose          -- passed through to fetch() for request logging

    Returns a list of index entries: a header string followed by
    (relative_path, title) tuples, for building the course index page.
    """
    ###
    ### FORUMS
    ###
    index = []
    forum_f = course_folder + '/forums'
    headered = 0          # have we emitted the "Discussion Forums" header yet?
    image_count = 0       # NOTE(review): never used here — presumably a leftover
    print("\nFORUMS")
    try:
        # mkdir is the first statement on purpose: if the forums folder
        # already exists, its OSError routes to the outer except and the
        # whole forum download is skipped (crude "already done" caching).
        os.mkdir(forum_f)
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            # one-line summary of this topic on the console
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            # first fetch: topic metadata (title + opening message)
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
            title = t2['title']
            message = t2['message']

            # second fetch: the full reply tree ("view") plus participants
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
            try:
                # user id -> participant record, for author name lookups
                participants = {x['id']:x for x in t2['participants']}
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
                    fd.write(f"<h1>{title}</h1>\n")
                    fd.write(message + "\n\n")
                    # write each top-level post; write_message recurses into replies
                    for v in t2['view']:
                        write_message(fd, v, participants)
                # emit the section header once, before the first forum entry
                if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items_inorder[ item_id_to_index[ p['id'] ] ] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
                else:
                    print(' This forum didnt seem to be in the modules list.')
            except Exception as e:
                # per-topic failure (bad data, unwritable filename): log and
                # continue with the next forum rather than aborting the run
                print("Error here:", e)
            #print p
            #print results_dict
    except Exception as e:
        # Reached when os.mkdir fails (folder exists) — but also swallows
        # fetch/API errors from the loop above, which then print the same
        # misleading message.  NOTE(review): consider narrowing this.
        print("** Forum folder seems to exist. Skipping those.")
        print(e)

    return index
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Download everything interesting in a course to a local folder
|
||||||
|
# Build a master file with the entire class content
|
||||||
|
def accessible_check(id=""):
|
||||||
|
if not id:
|
||||||
|
id = input("ID of course to check? ")
|
||||||
|
verbose = 1
|
||||||
|
|
||||||
|
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
||||||
|
|
||||||
|
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
||||||
|
|
||||||
|
# reverse lookup into items array
|
||||||
|
item_id_to_index = {}
|
||||||
|
|
||||||
|
# is it used?
|
||||||
|
items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak,]
|
||||||
|
running_index = 1
|
||||||
|
|
||||||
|
modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)
|
||||||
|
|
||||||
|
# headers / module names
|
||||||
|
items = []
|
||||||
|
for x in range(9000): items.append(0)
|
||||||
|
|
||||||
video_link_list = []
|
video_link_list = []
|
||||||
|
|
||||||
for m in modules:
|
for m in modules:
|
||||||
|
|
@ -89,6 +221,8 @@ def accessible_check(id=""):
|
||||||
# assignments and files have content_id, pages have page_url
|
# assignments and files have content_id, pages have page_url
|
||||||
|
|
||||||
course_folder = '../course_temps/course_'+id
|
course_folder = '../course_temps/course_'+id
|
||||||
|
|
||||||
|
# list of each item, organized by item type. Tuples of (url,title)
|
||||||
index = []
|
index = []
|
||||||
try:
|
try:
|
||||||
os.mkdir(course_folder)
|
os.mkdir(course_folder)
|
||||||
|
|
@ -159,7 +293,7 @@ def accessible_check(id=""):
|
||||||
|
|
||||||
if os.path.exists(this_page_filename):
|
if os.path.exists(this_page_filename):
|
||||||
d(" - already downloaded %s" % this_page_filename)
|
d(" - already downloaded %s" % this_page_filename)
|
||||||
this_page_content = open(this_page_filename,'r').read()
|
this_page_content = codecs.open(this_page_filename,'r','utf-8').read()
|
||||||
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
|
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
|
||||||
d(' * skipping file behind passwords')
|
d(' * skipping file behind passwords')
|
||||||
else:
|
else:
|
||||||
|
|
@ -168,18 +302,22 @@ def accessible_check(id=""):
|
||||||
bb = bs(t2['body'],features="lxml")
|
bb = bs(t2['body'],features="lxml")
|
||||||
a_links = bb.find_all('a')
|
a_links = bb.find_all('a')
|
||||||
for A in a_links:
|
for A in a_links:
|
||||||
if re.search( r'youtu', A['href']):
|
href = A.get('href')
|
||||||
video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") )
|
|
||||||
|
if href and re.search( r'youtu',href):
|
||||||
|
video_link_list.append( (A.get('href'), A.text, 'pages/'+easier_filename + ".html") )
|
||||||
|
|
||||||
|
|
||||||
page_images = bb.find_all('img')
|
page_images = bb.find_all('img')
|
||||||
for I in page_images:
|
for I in page_images:
|
||||||
d(' - %s' % I['src'])
|
src = I.get('src')
|
||||||
if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']):
|
if src:
|
||||||
|
d(' - %s' % src)
|
||||||
|
if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src):
|
||||||
d(' * skipping file behind passwords')
|
d(' * skipping file behind passwords')
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
r = requests.get(I['src'],headers=header, stream=True)
|
r = requests.get(src,headers=header, stream=True)
|
||||||
mytype = r.headers['content-type']
|
mytype = r.headers['content-type']
|
||||||
#print("Response is type: " + str(mytype))
|
#print("Response is type: " + str(mytype))
|
||||||
r_parts = mytype.split("/")
|
r_parts = mytype.split("/")
|
||||||
|
|
@ -249,63 +387,10 @@ def accessible_check(id=""):
|
||||||
###
|
###
|
||||||
### FORUMS
|
### FORUMS
|
||||||
###
|
###
|
||||||
"""forum_f = course_folder + '/forums'
|
|
||||||
headered = 0
|
|
||||||
image_count = 0
|
|
||||||
print("\nFORUMS")
|
|
||||||
try:
|
|
||||||
os.mkdir(forum_f)
|
|
||||||
forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
|
|
||||||
for p in forums:
|
|
||||||
p['title'] = clean_title(p['title'])
|
|
||||||
forum_id = p['id']
|
|
||||||
easier_filename = p['title']
|
|
||||||
for a in 'title,posted_at,published'.split(','):
|
|
||||||
print(str(p[a]), "\t", end=' ')
|
|
||||||
print("")
|
|
||||||
t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
|
|
||||||
|
|
||||||
|
|
||||||
#### REMOVED
|
|
||||||
bb = bs(t2['body'],features="lxml")
|
|
||||||
print("IMAGES IN THIS PAGE")
|
|
||||||
page_images = bb.find_all('img')
|
|
||||||
for I in page_images:
|
|
||||||
r = requests.get(I['src'],headers=header, stream=True)
|
|
||||||
mytype = r.headers['content-type']
|
|
||||||
print("Response is type: " + str(mytype))
|
|
||||||
r_parts = mytype.split("/")
|
|
||||||
ending = r_parts[-1]
|
|
||||||
|
|
||||||
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
|
|
||||||
for chunk in r.iter_content(chunk_size=128):
|
|
||||||
fd.write(chunk)
|
|
||||||
image_count += 1
|
|
||||||
#### END REMOVED
|
|
||||||
|
|
||||||
try:
|
|
||||||
with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
|
|
||||||
fd.write("<h1>"+t2['title']+"</h1>\n")
|
|
||||||
fd.write(t2['message'])
|
|
||||||
if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
|
|
||||||
headered = 1
|
|
||||||
index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
|
|
||||||
|
|
||||||
# write to running log of content in order of module
|
|
||||||
if p['id'] in item_id_to_index:
|
|
||||||
items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
|
|
||||||
else:
|
|
||||||
print(' This forum didnt seem to be in the modules list.')
|
|
||||||
except Exception as e:
|
|
||||||
print("Error here:", e)
|
|
||||||
#print p
|
|
||||||
#print results_dict
|
|
||||||
except Exception as e:
|
|
||||||
print("** Forum folder seems to exist. Skipping those.")
|
|
||||||
print(e)
|
|
||||||
|
|
||||||
|
|
||||||
|
index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1286,6 +1371,7 @@ if __name__ == "__main__":
|
||||||
2: ['download multiple classes', multiple_downloads ],
|
2: ['download multiple classes', multiple_downloads ],
|
||||||
3: ['convert stuff', pan_testing ],
|
3: ['convert stuff', pan_testing ],
|
||||||
4: ['convert md to html', md_to_course ],
|
4: ['convert md to html', md_to_course ],
|
||||||
|
5: ['course download tester', test_forums ],
|
||||||
# 5: ['import freshdesk content', freshdesk ],
|
# 5: ['import freshdesk content', freshdesk ],
|
||||||
6: ['download all a courses pages', grab_course_pages],
|
6: ['download all a courses pages', grab_course_pages],
|
||||||
7: ['demo vector search', demo_vector_search],
|
7: ['demo vector search', demo_vector_search],
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue