course downloading

parent 84f0a97529 · commit 1c3f9dbf10

content.py · 256 changed lines
@@ -14,24 +14,18 @@ from sentence_transformers import SentenceTransformer, util

h = HTMLParser()

pagebreak = '\n\n<!-- BREAK -->\n\n'

DBG = 1

def d(s):
    global DBG
    if DBG: print(s)


# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
-def accessible_check(id=""):
+def test_forums(id=0):
    if not id:
        id = input("ID of course to check? ")
-   pagebreak = '\n\n<!-- BREAK -->\n\n'
    verbose = 1

    save_file_types = ['application/pdf', 'application/docx', 'image/jpg', 'image/png', 'image/gif', 'image/webp', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    courseinfo = fetch('/api/v1/courses/' + str(id), verbose)

    item_id_to_index = {}
@@ -42,6 +36,144 @@ def accessible_check(id=""):

    items = []
    for x in range(9000): items.append(0)

    for m in modules:
        items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
        running_index += 1

        mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/' + str(m['id']) + '/items', verbose)

        for I in mod_items:

            if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl'] or 'content_id' in I:
                running_index += 1

                if I['type'] == 'SubHeader':
                    #print('subheader: ' + str(I))
                    items[running_index] = '<h3>%s</h3>\n' % str(json.dumps(I, indent=2))

                if I['type'] == 'Page':
                    item_id_to_index[ I['page_url'] ] = running_index

                if I['type'] == 'Quiz':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'Discussion':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'ExternalUrl':
                    items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])

                # ?
                #if 'content_id' in I:
                #    item_id_to_index[ I['content_id'] ] = running_index
            else:
                print("What is this item? " + str(I))

                #items_inorder.append('<i>Not included: ' + I['title'] + ' (a ' + I['type'] + ')</i>\n\n\n' )

    # I['title']
    # I['content_id']
    # I['page_url']
    # I['type']
    # I['published']
    # assignments and files have content_id, pages have page_url
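    # A hedged illustration of the two shapes handled above (field subset
    # inferred from this code, not full Canvas API output; values invented):
    #   {'type': 'Page', 'title': 'Syllabus', 'page_url': 'syllabus', 'published': True}
    #   {'type': 'Quiz', 'title': 'Week 1 Quiz', 'content_id': 12345, 'published': True}
    # Pages key the reverse lookup by page_url; quizzes, discussions,
    # assignments and files key it by content_id.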
    course_folder = '../course_temps/course_' + str(id)
    index = []
    try:
        os.mkdir(course_folder)
    except FileExistsError:
        print("Course folder exists.")

    index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )
    print(json.dumps(index, indent=2))

def write_message(fd, view, participants):
    fd.write(f"<blockquote>\nfrom <b>{participants[view['user_id']]['display_name']}</b>:<br />\n{view['message']}\n<br />")
    if 'replies' in view:
        for r in view['replies']:
            write_message(fd, r, participants)
    fd.write("</blockquote>\n")

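# write_message renders one post, then recurses through its nested 'replies',
# so thread depth maps to <blockquote> nesting depth. A minimal sketch of the
# payload it expects (shape inferred from the field accesses above; names and
# values are illustrative, not verified Canvas output):
#
#   participants = {7: {'display_name': 'Ana'}, 9: {'display_name': 'Bo'}}
#   view = {'user_id': 7, 'message': '<p>Question?</p>',
#           'replies': [{'user_id': 9, 'message': '<p>Answer.</p>'}]}
#   write_message(fd, view, participants)   # emits nested blockquotes
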
def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0):
    ###
    ### FORUMS
    ###
    index = []
    forum_f = course_folder + '/forums'
    headered = 0
    image_count = 0
    print("\nFORUMS")
    try:
        os.mkdir(forum_f)
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
            title = t2['title']
            message = t2['message']

            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
            try:
                participants = {x['id']: x for x in t2['participants']}
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w', 'utf-8') as fd:
                    fd.write(f"<h1>{title}</h1>\n")
                    fd.write(message + "\n\n")
                    for v in t2['view']:
                        write_message(fd, v, participants)
                if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items_inorder[ item_id_to_index[ p['id'] ] ] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
                else:
                    print(" This forum didn't seem to be in the modules list.")
            except Exception as e:
                print("Error here:", e)
                #print p
                #print results_dict
    except Exception as e:
        print("** Forum folder seems to exist. Skipping those.")
        print(e)

    return index


# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def accessible_check(id=""):
    if not id:
        id = input("ID of course to check? ")
    verbose = 1

    save_file_types = ['application/pdf', 'application/docx', 'image/jpg', 'image/png', 'image/gif', 'image/webp', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    courseinfo = fetch('/api/v1/courses/' + str(id), verbose)

    # reverse lookup into items array
    item_id_to_index = {}

    # is it used?
    items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak, ]
    running_index = 1

    modules = fetch('/api/v1/courses/' + str(id) + '/modules', verbose)

    # headers / module names
    items = []
    for x in range(9000): items.append(0)

    video_link_list = []
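The fetch helper is not shown in this diff; Canvas list endpoints (modules, module items, discussion_topics) paginate via Link headers, so a wrapper in this spirit is one plausible shape for it (a sketch under that assumption, not this repo's implementation):

import requests

def fetch_all(url, headers):
    # follow the 'next' rel of the Link header until the list is exhausted
    results = []
    while url:
        r = requests.get(url, headers=headers)
        results.extend(r.json())
        url = r.links.get('next', {}).get('url')
    return results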
@@ -89,6 +221,8 @@ def accessible_check(id=""):

    # assignments and files have content_id, pages have page_url

    course_folder = '../course_temps/course_' + str(id)

    # list of each item, organized by item type. Tuples of (url, title)
    index = []
    try:
        os.mkdir(course_folder)
@@ -159,7 +293,7 @@ def accessible_check(id=""):

        if os.path.exists(this_page_filename):
            d(" - already downloaded %s" % this_page_filename)
-           this_page_content = open(this_page_filename, 'r').read()
+           this_page_content = codecs.open(this_page_filename, 'r', 'utf-8').read()
        elif re.search(r'eis-prod', p['url']) or re.search(r'gavilan\.ins', p['url']):
            d(' * skipping file behind passwords')
        else:
@@ -167,31 +301,35 @@ def accessible_check(id=""):

        if t2 and 'body' in t2 and t2['body']:
            bb = bs(t2['body'], features="lxml")
            a_links = bb.find_all('a')
-           for A in a_links:
-               if re.search( r'youtu', A['href']):
-                   video_link_list.append( (A['href'], A.text, 'pages/' + easier_filename + ".html") )
+           for A in a_links:
+               href = A.get('href')
+
+               if href and re.search( r'youtu', href):
+                   video_link_list.append( (A.get('href'), A.text, 'pages/' + easier_filename + ".html") )

            page_images = bb.find_all('img')
-           for I in page_images:
-               d(' - %s' % I['src'])
-               if re.search(r'eis-prod', I['src']) or re.search(r'gavilan\.ins', I['src']):
-                   d(' * skipping file behind passwords')
-               else:
-                   try:
-                       r = requests.get(I['src'], headers=header, stream=True)
-                       mytype = r.headers['content-type']
-                       #print("Response is type: " + str(mytype))
-                       r_parts = mytype.split("/")
-                       ending = r_parts[-1]
-
-                       with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
-                           for chunk in r.iter_content(chunk_size=128):
-                               fd.write(chunk)
-                       image_count += 1
-                   except Exception as e:
-                       d( ' * Error downloading page image, %s' % str(e) )
+           for I in page_images:
+               src = I.get('src')
+               if src:
+                   d(' - %s' % src)
+                   if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src):
+                       d(' * skipping file behind passwords')
+                   else:
+                       try:
+                           r = requests.get(src, headers=header, stream=True)
+                           mytype = r.headers['content-type']
+                           #print("Response is type: " + str(mytype))
+                           r_parts = mytype.split("/")
+                           ending = r_parts[-1]
+
+                           with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
+                               for chunk in r.iter_content(chunk_size=128):
+                                   fd.write(chunk)
+                           image_count += 1
+                       except Exception as e:
+                           d( ' * Error downloading page image, %s' % str(e) )

            try:
                with codecs.open(this_page_filename, 'w', 'utf-8') as fd:
                    this_page_content = "<h2>%s</h2>\n%s" % ( t2['title'], t2['body'] )
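The image pass above derives a file suffix from the response's content-type ('image/png' → 'png') and streams the body in chunks so large files never sit fully in memory. The same idea as a standalone sketch (function name and chunk size are arbitrary choices, not from this codebase):

import requests

def save_by_content_type(url, dest_stem, headers=None):
    r = requests.get(url, headers=headers, stream=True)
    ending = r.headers.get('content-type', 'application/octet-stream').split('/')[-1]
    path = '%s.%s' % (dest_stem, ending)
    with open(path, 'wb') as fh:
        for chunk in r.iter_content(chunk_size=8192):
            fh.write(chunk)
    return path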
@@ -249,63 +387,10 @@ def accessible_check(id=""):

    ###
    ### FORUMS
    ###
    """forum_f = course_folder + '/forums'
    headered = 0
    image_count = 0
    print("\nFORUMS")
    try:
        os.mkdir(forum_f)
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/' + str(forum_id), verbose)

            #### REMOVED
            bb = bs(t2['body'], features="lxml")
            print("IMAGES IN THIS PAGE")
            page_images = bb.find_all('img')
            for I in page_images:
                r = requests.get(I['src'], headers=header, stream=True)
                mytype = r.headers['content-type']
                print("Response is type: " + str(mytype))
                r_parts = mytype.split("/")
                ending = r_parts[-1]

                with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
                image_count += 1
            #### END REMOVED

            try:
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w', 'utf-8') as fd:
                    fd.write("<h1>" + t2['title'] + "</h1>\n")
                    fd.write(t2['message'])
                if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>' + t2['title'] + '</h1>\n\n' + t2['message'] + '\n\n' + pagebreak
                else:
                    print(" This forum didn't seem to be in the modules list.")
            except Exception as e:
                print("Error here:", e)
                #print p
                #print results_dict
    except Exception as e:
        print("** Forum folder seems to exist. Skipping those.")
        print(e)



    index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) )

    """
@@ -1286,6 +1371,7 @@ if __name__ == "__main__":

        2: ['download multiple classes', multiple_downloads ],
        3: ['convert stuff', pan_testing ],
        4: ['convert md to html', md_to_course ],
+       5: ['course download tester', test_forums ],
        # 5: ['import freshdesk content', freshdesk ],
        6: ["download all a course's pages", grab_course_pages],
        7: ['demo vector search', demo_vector_search],
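Each menu entry pairs a human-readable label with a handler; the driver loop lives outside this hunk, but dispatch presumably works along these lines (a sketch with assumed names, not code from this commit):

choice = int(input('Choice? '))
label, fn = menu[choice]   # e.g. 5 -> ['course download tester', test_forums]
fn()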