canvasapp/content.py

from __future__ import annotations

#saved_titles = json.loads( codecs.open('cache/saved_youtube_titles.json','r','utf-8').read() )

#from calendar import FRIDAY
#import html2markdown as h2m

from typing import ItemsView
import requests, codecs, os, re, json, sys, pypandoc, mimetypes, hashlib
from checker import safe_html
from pipelines import header, fetch, url
from util import clean_title, to_file_friendly
from urllib.parse import quote, urljoin, urlparse
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
from datetime import datetime, timezone


# HTML snippet appended after sections of the aggregated course document:
# a marker comment followed by a CSS page-break <div>.
pagebreak = '\n\n<!-- BREAK -->\n\n<div style="page-break-before: always;"></div>\n\n'
# Debug switch read by d(): when truthy, d() prints its argument.
DBG = 1

# Module-level list of HTML fragments in module order. course_download()
# rebinds it and extract_forums() writes forum content into it by index.
items = []

def d(s):
    """Print *s* only while the module-level ``DBG`` flag is truthy."""
    if not DBG:
        return
    print(s)

def test_forums(id=0):
    """Run the forum export for a single course and print the resulting index.

    Walks the course's modules to build the ``item_id_to_index`` lookup
    (page url / content id -> slot in ``items``), makes sure the course
    folder exists, then calls extract_forums() and prints what it returned
    as JSON.  Prompts for the course id when *id* is falsy.

    Rebinds the module-level ``items`` list, because that is the list
    extract_forums() writes forum content into.
    """
    global items

    if not id:
        id = input("ID of course to check?  ")
    # Callers may pass an int; the id is concatenated into a path below.
    id = str(id)
    verbose = 1

    courseinfo = fetch('/api/v1/courses/' + id, verbose )

    item_id_to_index = {}
    items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak,]
    running_index = 1

    modules = fetch('/api/v1/courses/' + id + '/modules',verbose)

    # Pre-sized so fragments can be stored by index.  This must be the
    # module-level list: extract_forums() assigns into the global ``items``.
    items = [0] * 9000

    for m in modules:
        items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
        running_index += 1
        mod_items = fetch('/api/v1/courses/' + id + '/modules/'+str(m['id'])+'/items', verbose)

        for I in mod_items:

            if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
                running_index += 1

                if I['type'] == 'SubHeader':
                    # Debug aid: dumps the whole module item rather than its title.
                    items[running_index] = '<h3>%s</h3>\n' % str(json.dumps(I,indent=2))

                if I['type'] == 'Page':
                    item_id_to_index[ I['page_url'] ] = running_index

                if I['type'] == 'Quiz':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'Discussion':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'ExternalUrl':
                    items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
            else:
                print("What is this item? " + str(I))
    # assignments and files have content_id, pages have page_url

    course_folder = '../course_temps/course_' + id
    index = []
    try:
        os.mkdir(course_folder)
    except OSError:
        print("Course folder exists.")

    index.extend( extract_forums(id, course_folder, item_id_to_index, verbose) )
    print(json.dumps(index,indent=2))

def write_message(fd, view, participants):
    """Write one discussion entry, and its replies recursively, as nested blockquotes.

    fd           -- writable text file object
    view         -- entry dict; reads 'user_id', 'message' and optional 'replies'
    participants -- dict mapping user id -> participant dict with 'display_name'

    Entries that lack a user id or message (for example deleted posts), or
    whose author is not in *participants*, are written with a placeholder
    author and an empty message instead of raising KeyError, so any replies
    beneath them are still exported.
    """
    participant = participants.get(view.get('user_id')) or {}
    author = participant.get('display_name', '(unknown)')
    message = view.get('message') or ''
    fd.write(f"<blockquote>\nfrom <b>{author}</b>:<br />\n{message}\n<br />")
    # 'replies' may be absent or null on leaf entries.
    for reply in view.get('replies') or []:
        write_message(fd, reply, participants)
    fd.write("</blockquote>\n")

def extract_forums(id, course_folder, item_id_to_index, verbose=0, discussion_link_map=None):
    """Save every discussion topic of a course as an HTML file under ``<course_folder>/forums``.

    id                  -- course id used in the API paths
    course_folder       -- local folder of the course download
    item_id_to_index    -- maps a discussion id to its slot in the global ``items`` list
    verbose             -- passed through to fetch()
    discussion_link_map -- optional dict; receives discussion id -> relative html path

    Returns a list for the course index: one header string followed by
    ``(relative_path, title)`` tuples.  If the forums folder already exists
    the export is skipped and an empty list is returned.  Forum content that
    appears in a module is also stored into the global ``items`` list.
    """
    ###
    ### FORUMS
    ###

    global items

    index = []
    forum_f = course_folder + '/forums'
    headered = 0
    print("\nFORUMS")

    # Creating the folder doubles as the "already exported?" check, so it is
    # handled separately from errors raised while fetching or writing.
    try:
        os.mkdir(forum_f)
    except FileExistsError as e:
        print("** Forum folder seems to exist. Skipping those.")
        print(e)
        return index
    except OSError as e:
        print("** Could not create forum folder. Skipping forums.")
        print(e)
        return index

    try:
        forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
        for p in forums:
            p['title'] = clean_title(p['title'])
            forum_id = p['id']
            easier_filename = p['title']
            for a in 'title,posted_at,published'.split(','):
                print(str(p[a]), "\t", end=' ')
            print("")
            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}", verbose)
            title = t2['title']
            # A topic without a body would otherwise break the concatenation below.
            message = t2.get('message') or ''

            t2 = fetch(f"/api/v1/courses/{id}/discussion_topics/{forum_id}/view", verbose)
            try:
                participants = {x['id']:x for x in t2['participants']}
                with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
                    fd.write(f"<h1>{title}</h1>\n")
                    fd.write(message + "\n\n")
                    for v in t2['view']:
                        write_message(fd, v, participants)
                if discussion_link_map is not None:
                    discussion_link_map[p['id']] = f"forums/{easier_filename}.html"
                if not headered:
                    index.append( ('<br /><b>Discussion Forums</b><br />') )
                headered = 1
                index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )

                # write to running log of content in order of module
                if p['id'] in item_id_to_index:
                    items[  item_id_to_index[ p['id'] ]  ] = f"<h1>{title}</h1>\n\n{message}\n\n{pagebreak}"
                else:
                    print('  This forum didnt seem to be in the modules list.')
            except Exception as e:
                # Best effort: one broken forum must not stop the others.
                print("Error here:", e)
    except Exception as e:
        print("** Problem while exporting forums.")
        print(e)

    return index


#
#
#
#
#
# todo: include front page.
# todo: clean html
# todo: toc
#
#
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
# Adjust image paths in aggregated snippets so they work from the course root.
def adjust_fullcourse_image_sources(html_fragment):
    """Rewrite page-relative image paths so they resolve from the course root.

    Inside ``<img>`` tags, ``src``, ``data-canvas-src`` and ``srcset`` URLs
    of the form ``images/...`` or ``./images/...`` become
    ``pages/images/...``.  All other URLs (absolute, ``../``-relative,
    already under ``pages/``) are left untouched.  Falsy input is returned
    unchanged.
    """
    if not html_fragment:
        return html_fragment

    def _prefix_images(match):
        # Group 2 always starts with "images/": an optional leading "./" is
        # consumed by the pattern outside the captured group.
        return f"{match.group(1)}pages/{match.group(2)}"

    src_pattern = re.compile(r'(<img[^>]+?\bsrc\s*=\s*[\'"])(?:\./)?(images/[^\'"]*)', re.IGNORECASE)
    html_fragment = src_pattern.sub(_prefix_images, html_fragment)

    canvas_pattern = re.compile(r'(<img[^>]+?\bdata-canvas-src\s*=\s*[\'"])(?:\./)?(images/[^\'"]*)', re.IGNORECASE)
    html_fragment = canvas_pattern.sub(_prefix_images, html_fragment)

    srcset_pattern = re.compile(r'(<img[^>]+?\bsrcset\s*=\s*[\'"])([^\'"]*)([\'"])', re.IGNORECASE | re.DOTALL)

    def _prefix_srcset(match):
        entries = []
        changed = False
        for chunk in match.group(2).split(','):
            parts = chunk.split()
            if not parts:
                continue
            candidate, descriptors = parts[0], parts[1:]
            # Drop exactly one leading "./".  str.lstrip('./') would strip any
            # run of dots and slashes and so mangle "../images/x" or "/images/x".
            relative = candidate[2:] if candidate.startswith('./') else candidate
            if relative.lower().startswith('images/'):
                candidate = f"pages/{relative}"
                changed = True
            entries.append(' '.join([candidate, *descriptors]))
        if not changed:
            # Keep the attribute byte-identical when nothing needed rewriting.
            return match.group(0)
        return f"{match.group(1)}{', '.join(entries)}{match.group(3)}"

    html_fragment = srcset_pattern.sub(_prefix_srcset, html_fragment)

    return html_fragment

def course_download(id=""):
    """Download one Canvas course into ``../course_temps/course_<id>``.

    Fetches the course's modules, files, pages, assignments and discussion
    forums and saves them locally (images referenced by pages are copied to
    ``pages/images``).  Then writes ``index.html``, ``modules.html``,
    ``pages_manifest.json`` and an aggregated ``fullcourse.raw.html`` that
    pypandoc converts to html, markdown and docx.  Prompts for the course
    id when *id* is empty.

    Rebinds the module-level ``items`` list, which holds the aggregated HTML
    fragments in module order.
    """
    global items

    if not id:
        id = input("ID of course to check?  ")
        # temp hard code
        #id = "21284"
    # The id is concatenated into paths below, so accept ints as well as strings.
    id = str(id)

    verbose = 0
    PAGES_ONLY = 0

    videos_log = codecs.open('cache/accessible_check_log.txt','w','utf-8')

    save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']

    courseinfo = fetch('/api/v1/courses/' + str(id), verbose )

    # reverse lookup into items array (page urls, quiz ids, discussion ids)
    item_id_to_index = {}
    # Assignment ids get their own lookup so they cannot be confused with a
    # quiz or discussion that happens to share the same numeric id.
    assignment_id_to_index = {}


    modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)

    # headers / module names; pre-sized so fragments can be stored by index
    items = [f"<h1>{courseinfo['name']}</h1>\n{pagebreak}",]
    running_index = 1
    items.extend([0] * 9000)

    video_link_list = []
    page_local_map = {}
    assignment_local_map = {}
    file_local_map = {}
    discussion_local_map = {}
    module_details = []
    canvas_host = urlparse(url).hostname if url else None

    for m in modules:
        items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
        running_index += 1

        mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
        module_entry = {'name': m['name'], 'items': []}

        for I in mod_items:

            if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
                running_index += 1

                if I['type'] == 'SubHeader':
                    items[running_index] = f"<h3>{I['title']}</h3>\n"

                if I['type'] == 'Page':
                    item_id_to_index[ I['page_url'] ] = running_index

                if I['type'] == 'Quiz':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'Discussion':
                    item_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'Assignment':
                    # The slot is reserved above; remember it so the
                    # assignment body can be placed in module order later.
                    assignment_id_to_index[ I['content_id'] ] = running_index

                if I['type'] == 'ExternalUrl':
                    items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
            else:
                print("What is this item? " + str(I))

            module_entry['items'].append({
                'type': I.get('type'),
                'title': I.get('title'),
                'page_url': I.get('page_url'),
                'content_id': I.get('content_id'),
                'html_url': I.get('html_url'),
                'url': I.get('url'),
                'external_url': I.get('external_url'),
                'id': I.get('id')
            })
        module_details.append(module_entry)
    # assignments and files have content_id, pages have page_url

    course_folder = '../course_temps/course_'+id

    # list of each item, organized by item type. Tuples of (url,title)
    index = []
    try:
        os.mkdir(course_folder)
    except OSError:
        print("Course folder exists.")
    ###
    ### FILES
    ###
    if not PAGES_ONLY:
        files_f = course_folder + '/files'
        headered = 0
        print("\nFILES")
        try:
            os.mkdir(files_f)
        except OSError:
            print(" * Files folder already exists.")

        files = fetch('/api/v1/courses/' + str(id) + '/files', verbose)
        print("LISTING COURSE FILES")
        for f in files:
            if f['content-type'] in save_file_types:
                d(' - %s' % f['filename'])

                local_file_path = files_f + '/' + f['filename']
                if not os.path.exists(local_file_path):
                    r = requests.get(f['url'],headers=header, stream=True)
                    with open(local_file_path, 'wb') as fd:
                        for chunk in r.iter_content(chunk_size=128):
                            fd.write(chunk)
                else:
                    d(" - already downloaded %s" % local_file_path)

                if not headered:
                    index.append( ('<br /><b>Files</b><br />') )
                    headered = 1
                relative_path = 'files/' + f['filename']
                index.append( (relative_path, f['filename']) )
                file_local_map[f['id']] = relative_path

    ###
    ### PAGES
    ###
    pages_f = course_folder + '/pages'
    headered = 0
    images_f = os.path.join(pages_f, 'images')
    try:
        os.makedirs(images_f)
    except FileExistsError:
        pass
    except Exception as e:
        print(f" * Unable to ensure images folder: {e}")

    # Maps an image source (as written in the page and/or its absolute form)
    # to a (path relative to pages/, path relative to course root) pair.
    image_map = {}
    image_counter = 0

    def ensure_local_image(src, canvas_override=None):
        """Return ((page_rel, full_rel) or None, original canvas src or None) for an image src.

        Downloads the image into pages/images on first sight and reuses the
        cached mapping afterwards.
        """
        nonlocal image_counter
        if not src:
            return (None, None)
        original_src = src
        if src.startswith('data:'):
            return (None, None)
        # Already-local paths only need their two relative forms recorded.
        if src.startswith('images/'):
            full_rel = f"pages/{src}"
            image_map.setdefault(original_src, (src, full_rel))
            return image_map[original_src], canvas_override
        if src.startswith('pages/'):
            page_rel = src.split('pages/', 1)[-1]
            page_rel = page_rel if page_rel else src
            full_rel = src
            image_map.setdefault(original_src, (page_rel, full_rel))
            return image_map[original_src], canvas_override

        mapped = image_map.get(original_src)
        if mapped:
            return mapped, canvas_override or original_src

        absolute_src = src
        if not absolute_src.lower().startswith('http'):
            absolute_src = urljoin(url, absolute_src)

        mapped = image_map.get(absolute_src)
        if mapped:
            image_map[original_src] = mapped
            return mapped, canvas_override or absolute_src

        try:
            target_host = urlparse(absolute_src).hostname
            # Only send the API headers to the Canvas host itself.
            request_headers = header if not canvas_host or target_host == canvas_host else None
            response = requests.get(absolute_src, headers=request_headers, stream=True, timeout=30)
            response.raise_for_status()
        except Exception as e:
            d(f"   * error downloading image {absolute_src}: {e}")
            return (None, canvas_override or absolute_src)

        # Pick an extension: content type first, then the URL path, then .bin
        content_type = response.headers.get('content-type', '').split(';')[0]
        ext = ''
        if content_type:
            guessed = mimetypes.guess_extension(content_type)
            if guessed:
                ext = guessed
        if not ext:
            ext = os.path.splitext(urlparse(absolute_src).path)[1]
        if not ext:
            ext = '.bin'
        ext = ext.lstrip('.')

        local_name = f"img_{image_counter}.{ext}"
        image_counter += 1
        local_path = os.path.join(images_f, local_name)

        try:
            with open(local_path, 'wb') as fd:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        fd.write(chunk)
        except Exception as e:
            d(f"   * error saving image {absolute_src}: {e}")
            return (None, canvas_override or absolute_src)

        page_rel = f"images/{local_name}"
        full_rel = f"pages/{page_rel}"
        image_map[original_src] = (page_rel, full_rel)
        if absolute_src != original_src:
            image_map[absolute_src] = image_map[original_src]
        return image_map[original_src], canvas_override or absolute_src
    print("\nPAGES")
    try:
        os.mkdir(pages_f)
    except OSError:
        print(" * Pages folder already exists.")


    page_manifest = {
        'course_id': str(id),
        'generated_at': datetime.now(timezone.utc).isoformat(),
        'pages': {}
    }

    pages = fetch('/api/v1/courses/' + str(id) + '/pages', verbose)
    for p in pages:
        d(' - %s' % p['title'])

        p['title'] = clean_title(p['title'])
        easier_filename = clean_title(p['url'])
        this_page_filename = "%s/%s.html" % (pages_f, easier_filename)

        if not headered:
            index.append( ('<br /><b>Pages</b><br />') )
            headered = 1
        index.append( ( 'pages/' + easier_filename + '.html', p['title'] ) )


        t2 = {'title': p['title']}
        soup_infolder = None
        soup_in_main = None
        page_local_map[p['url']] = f"pages/{easier_filename}.html"
        this_page_content = None

        # Two parses of the same body: one gets image paths relative to the
        # pages folder, the other relative to the course root.
        fetched_page = fetch('/api/v1/courses/' + str(id) + '/pages/'+p['url'], verbose)
        if fetched_page and fetched_page.get('body'):
            t2 = fetched_page
            soup_infolder = bs(t2['body'], features="lxml")
            soup_in_main = bs(t2['body'], features="lxml")
        elif os.path.exists(this_page_filename):
            d(" - already downloaded %s" % this_page_filename)
            with codecs.open(this_page_filename,'r','utf-8') as cached_page:
                this_page_content = cached_page.read()
            soup_infolder = bs(this_page_content, features="lxml")
            soup_in_main = bs(this_page_content, features="lxml")
        else:
            d('   * nothing returned or bad fetch')
            continue

        page_title = (t2.get('title') or p['title']).strip() if isinstance(t2, dict) else p['title']

        def strip_leading_heading(soup):
            # The title is re-added as <h2> when writing, so drop a first
            # heading that merely repeats it.
            if not soup:
                return
            first_heading = soup.find(['h1', 'h2'])
            if first_heading and first_heading.get_text(strip=True) == page_title:
                first_heading.decompose()

        strip_leading_heading(soup_infolder)
        strip_leading_heading(soup_in_main)

        a_links = soup_infolder.find_all('a')
        for A in a_links:
            href = A.get('href')
            if href and re.search(r'youtu', href):
                video_link_list.append((A.get('href'), A.text, 'pages/' + easier_filename + ".html"))

        # Images -> ensure local copies
        for img in soup_infolder.find_all('img'):
            mapping, canvas_src = ensure_local_image(img.get('src'), img.get('data-canvas-src'))
            if mapping:
                img['src'] = mapping[0]
                if canvas_src:
                    img['data-canvas-src'] = canvas_src

        for img in soup_in_main.find_all('img'):
            mapping, canvas_src = ensure_local_image(img.get('src'), img.get('data-canvas-src'))
            if mapping:
                img['src'] = mapping[1]
                if canvas_src:
                    img['data-canvas-src'] = canvas_src

        # STUDIO VIDEOS
        pattern = r"custom_arc_media_id%3D([^&]+)"
        for iframe in soup_infolder.find_all("iframe"):
            src = iframe.get("src")
            if not src:
                continue
            match = re.search(pattern, src)
            if match:
                videos_log.write(f"page: {p['url']}  arc id: {match.group(1)}\n")
                videos_log.flush()
            videos_log.write(f"page: {p['url']}  iframe src: {src}\n")
            videos_log.flush()
            if 'instructuremedia.com' in src:
                try:
                    iframe_response = requests.get(src, timeout=15)
                    iframe_response.raise_for_status()
                except Exception as e:
                    print(f"Failed to retrieve iframe content from: {src} ({e})")
                    continue
                videos_log.write(f"succesfully fetched {src}\n")
                videos_log.flush()
                iframe_soup = bs(iframe_response.text, 'html.parser')
                for source_tag in iframe_soup.find_all('source'):
                    videos_log.write(f"page: {p['url']} video src: {source_tag.get('src')}\n")
                    videos_log.flush()

        # WRITE out page (always refresh to ensure local paths)
        try:
            this_page_content = f"<h2>{t2['title']}</h2>\n{soup_infolder.prettify()}"
            with codecs.open(this_page_filename, 'w','utf-8') as fd:
                fd.write(this_page_content)
        except Exception as e:
            d(f' * problem writing page content: {e}')

        # write to running log of content in order of module
        if p and p['url'] in item_id_to_index and soup_in_main:
            items[item_id_to_index[p['url']]] = f"<h2>{t2['title']}</h2>\n{soup_in_main.prettify()}\n{pagebreak}"
        else:
            d(' -- This page didnt seem to be in the modules list.')

        # The hash lets upload_modified_pages() detect local edits later.
        if this_page_content is not None:
            page_hash = hashlib.sha256(this_page_content.encode('utf-8')).hexdigest()
            page_manifest['pages'][p['url']] = {
                'title': t2.get('title') or p['title'],
                'filename': f"pages/{easier_filename}.html",
                'hash': page_hash
            }

    # The video log is only written while processing pages.
    videos_log.close()

    manifest_path = os.path.join(course_folder, 'pages_manifest.json')
    with codecs.open(manifest_path, 'w', 'utf-8') as manifest_file:
        manifest_file.write(json.dumps(page_manifest, indent=2))

    ###
    ### ASSIGNMENTS
    ###

    if not PAGES_ONLY:
        headered = 0
        asm_f = course_folder + '/assignments'
        print("\nASSIGNMENTS")
        try:
            os.mkdir(asm_f)
        except OSError:
            d(" - Assignments dir exists")

        asm = fetch('/api/v1/courses/' + str(id) + '/assignments', verbose)
        for p in asm:
            d(' - %s' % p['name'])

            try:
                friendlyfile = to_file_friendly(p['name'])
                assmt_basename = str(p['id']) + "_" + friendlyfile + '.html'
                this_assmt_filename = asm_f + '/' + assmt_basename
                relative_path = 'assignments/' + assmt_basename
                assignment_local_map[p['id']] = relative_path
                if os.path.exists(this_assmt_filename):
                    d(" - already downloaded %s" % this_assmt_filename)
                    # Read back with the same encoding the file was written in.
                    with codecs.open(this_assmt_filename, 'r', 'utf-8') as fd:
                        this_assmt_content = fd.read()
                else:
                    t2 = fetch('/api/v1/courses/' + str(id) + '/assignments/'+str(p['id']), verbose)
                    # Build the content first so a bad response leaves no empty file.
                    this_assmt_content = "<h2>%s</h2>\n%s\n\n" % (t2['name'], t2['description'])
                    with codecs.open(this_assmt_filename, 'w','utf-8') as fd:
                        fd.write(this_assmt_content)

                # Index every assignment, cached or fresh, because index.html
                # is regenerated on each run.
                if not headered:
                    index.append( ('<br /><b>Assignments</b><br />') )
                    headered = 1
                index.append( (relative_path, p['name']) )

                # write to running log of content in order of module
                if p['id'] in assignment_id_to_index:
                    items[ assignment_id_to_index[ p['id'] ] ] = this_assmt_content+'\n\n'+pagebreak
            except Exception as e:
                d(' * Problem %s' % str(e))

        ###
        ### FORUMS
        ###

        index.extend( extract_forums(id, course_folder, item_id_to_index, verbose, discussion_local_map) )

        # QUIZZES and a separate list of external links are not exported.


    # Create index page of all gathered items
    index.insert(0, ('modules.html', 'Modules Overview'))

    with codecs.open(course_folder+'/index.html','w','utf-8') as myindex:
        for entry in index:
            # Tuples are (href, title) links; plain strings are section headers.
            if isinstance(entry, tuple):
                myindex.write(f"<a href='{entry[0]}'>{entry[1]}</a><br />\n")
            else:
                myindex.write(entry)

    def resolve_module_item_link(item):
        """Return the best link for a module item: local copy first, Canvas URL otherwise."""
        item_type = (item.get('type') or '').lower()
        if item_type == 'page':
            return page_local_map.get(item.get('page_url')) or item.get('html_url')
        if item_type == 'assignment':
            return assignment_local_map.get(item.get('content_id')) or item.get('html_url')
        if item_type == 'discussion':
            return discussion_local_map.get(item.get('content_id')) or item.get('html_url')
        if item_type == 'file':
            return file_local_map.get(item.get('content_id')) or item.get('html_url')
        if item_type == 'externalurl':
            return item.get('external_url')
        if item_type in ('externaltool', 'quiz', 'assignmentquiz', 'attendance'):
            return item.get('html_url') or item.get('url')
        if item_type == 'subheader':
            return None
        return item.get('html_url') or item.get('url')

    module_index_path = course_folder + '/modules.html'
    with codecs.open(module_index_path, 'w', 'utf-8') as module_index:
        module_index.write('<html><body>\n')
        module_index.write(f"<h1>{courseinfo['name']} - Modules</h1>\n")
        for module in module_details:
            module_index.write(f"<h2>{module['name']}</h2>\n<ul>\n")
            for item in module['items']:
                title = item.get('title') or '(Untitled)'
                item_type = item.get('type') or 'Item'
                link = resolve_module_item_link(item)
                if item_type.lower() == 'subheader':
                    module_index.write(f"<li><strong>{title}</strong></li>\n")
                    continue
                if link:
                    module_index.write(f"<li><a href='{link}'>{title}</a> <em>({item_type})</em></li>\n")
                else:
                    module_index.write(f"<li>{title} <em>({item_type})</em></li>\n")
            module_index.write('</ul>\n')
        module_index.write('</body></html>\n')


    # Full course content in single file
    print("Writing main course files...")
    with codecs.open(course_folder+'/fullcourse.raw.html','w','utf-8') as mycourse:
        mycourse.write("<html><head></head><body>\n")

        for I in items:
            if I:
                mycourse.write(adjust_fullcourse_image_sources(I))

        # The video table belongs inside <body>, so it goes before the closing tags.
        if video_link_list:
            mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
            for V in video_link_list:
                video_url, txt, pg = V
                mycourse.write("<tr><td><a target='_blank' href='"+video_url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
            mycourse.write("</table>\n")

        mycourse.write("\n</body></html>")

    # Debug dump of the collected structures
    with open('cache/coursedump.txt','w') as temp:
        temp.write( "items:  " +  json.dumps(items,indent=2) )
        temp.write("\n\n\n")
        temp.write( "index:  " + json.dumps(index,indent=2) )
        temp.write("\n\n\n")
        temp.write( "item_id_to_index:  " + json.dumps(item_id_to_index,indent=2) )

    try:
        pypandoc.convert_file(course_folder+'/fullcourse.raw.html', 'html', outputfile=course_folder+"/fullcourse.html")
    except Exception as e:
        print(f"couldn't create html fullcourse page: {e}")
    try:
        pypandoc.convert_file(course_folder+'/fullcourse.html', 'md', outputfile=course_folder+"/fullcourse.md")
    except Exception as e:
        print(f"couldn't create markdown fullcourse page: {e}")
    try:
        pypandoc.convert_file(course_folder+'/fullcourse.html', 'docx', outputfile=course_folder+"/fullcourse.docx")
    except Exception as e:
        print(f"couldn't create doc fullcourse page: {e}")

def restore_canvas_image_sources(html_fragment):
    """Point each <img> back at the URL stored in its ``data-canvas-src`` attribute.

    Parses *html_fragment* with lxml; for every image that carries a
    non-empty ``data-canvas-src``, copies it into ``src`` and removes the
    attribute.  Returns ``(html, changed)``: *html* is the content of the
    parsed ``<body>`` (or the whole document when there is no body) and
    *changed* tells whether any image was rewritten.
    """
    document = bs(html_fragment, features="lxml")
    restored_any = False
    for image in document.find_all('img'):
        original_src = image.get('data-canvas-src')
        if not original_src:
            continue
        image['src'] = original_src
        del image['data-canvas-src']
        restored_any = True

    body_tag = document.body
    if body_tag:
        # Serialise only what is inside <body>, dropping the wrapper the parser added.
        markup = ''.join(map(str, body_tag.children))
    else:
        markup = document.decode()
    return markup, restored_any


def _push_page_update(course_num, page_slug, new_content):
    """PUT *new_content* as the body of one Canvas wiki page.

    course_num  -- course id used in the API path
    page_slug   -- page url slug used in the API path
    new_content -- HTML to store as the page body

    Returns True on success, False (after printing the response) when the
    server answers with a status of 400 or above.
    """
    endpoint = f"{url}/api/v1/courses/{course_num}/pages/{page_slug}"
    payload = {'wiki_page[body]': new_content}
    # Send the HTML as a form body. Passing it as ``params`` would put the
    # whole page into the URL query string, which breaks for large pages.
    response = requests.put(endpoint, headers=header, data=payload)
    if response.status_code >= 400:
        print(f" - Failed to upload {page_slug}: {response.status_code} {response.text}")
        return False
    print(f" - Uploaded {page_slug}")
    return True


def upload_modified_pages(course_id=None, confirm_each=False):
    """Upload locally edited pages of a downloaded course back to Canvas.

    Reads ``pages_manifest.json`` from the course folder and, for each page
    whose local file no longer matches the recorded SHA-256 hash, restores
    the original image sources and pushes the page.  Hashes of uploaded
    pages are written back to the manifest.

    course_id    -- course to process; prompted for when falsy
    confirm_each -- ask for confirmation before every upload
    """
    if not course_id:
        course_id = input("course id> ").strip()
    if not course_id:
        print("No course id provided; aborting.")
        return

    course_folder = f"../course_temps/course_{course_id}"
    manifest_path = os.path.join(course_folder, 'pages_manifest.json')
    if not os.path.exists(manifest_path):
        print(f"No manifest found at {manifest_path}. Run course_download first.")
        return

    with codecs.open(manifest_path, 'r', 'utf-8') as manifest_file:
        manifest = json.load(manifest_file)

    pages = manifest.get('pages', {})
    if not pages:
        print("Manifest contains no page entries.")
        return

    uploaded_any = False
    for slug, meta in pages.items():
        local_rel = meta.get('filename')
        if not (local_rel and local_rel.startswith('pages/')):
            print(f" - Skipping {slug}: not a downloaded page ({local_rel})")
            continue
        local_path = os.path.join(course_folder, local_rel)
        if not os.path.exists(local_path):
            print(f" - Skipping {slug}: local file missing ({local_rel})")
            continue

        with codecs.open(local_path, 'r', 'utf-8') as local_file:
            local_html = local_file.read()

        # An unchanged hash means the page was not edited since download.
        current_hash = hashlib.sha256(local_html.encode('utf-8')).hexdigest()
        if current_hash == meta.get('hash'):
            continue

        restored_html, changed = restore_canvas_image_sources(local_html)
        payload = restored_html if changed else local_html

        if confirm_each:
            answer = input(f"Upload changes for {slug}? [y/N]: ").strip().lower()
            if answer not in ('y', 'yes'):
                print(f" - Skipped {slug} by user request")
                continue

        if _push_page_update(course_id, slug, payload):
            manifest['pages'][slug]['hash'] = current_hash
            uploaded_any = True

    if not uploaded_any:
        print("No page uploads performed.")
        return

    with codecs.open(manifest_path, 'w', 'utf-8') as manifest_file:
        manifest_file.write(json.dumps(manifest, indent=2))
    print("Updated manifest hashes for uploaded pages.")


def upload_modified_pages_prompt():
    """Run upload_modified_pages() with its defaults, so it prompts for the course id."""
    upload_modified_pages()


def media_testing():
    """Fetch and print the media list of one hard-coded user from the media API."""
    user_id = 285  #ksmith
    endpoint = f"https://gavilan.instructuremedia.com/api/public/v1/users/{user_id}/media"
    print(fetch(endpoint, verbose=1, media=1))

def pan_testing():
    """Convert the full-course markdown of one hard-coded course folder to HTML with pypandoc."""
    course_folder = '../course_temps/course_6862'
    markdown_in = course_folder+'/fullcourse.md'
    html_out = course_folder+"/fullcourse.v2.html"
    pypandoc.convert_file(markdown_in, 'html', outputfile=html_out)

# Given course, page url, and new content, upload the new revision of a page
def create_page(course_num,new_title,new_content):
    """Create a new wiki page in a course after an interactive confirmation.

    Args:
        course_num: Canvas course id; converted with str() when building the URL.
        new_title: Title of the page to create.
        new_content: HTML body of the page.

    Nothing is sent unless the user types ``1`` at the prompt. The raw
    response object is printed, followed by 'ok' or 'failed' depending on the
    HTTP status.
    """
    t3 = url + '/api/v1/courses/' + str(course_num) + '/pages'
    print("Creating page: %s" % new_title)
    xyz = input('type 1 to confirm: ')
    if xyz != '1':
        return

    data = {'wiki_page[title]':new_title, 'wiki_page[body]':new_content}
    # Send the fields as a form body (as create_new_page does). Passing them
    # via params= would put the whole HTML body in the URL query string,
    # which fails for pages larger than the server's URL length limit.
    r3 = requests.post(t3, headers=header, data=data)
    print(r3)
    print('ok' if r3.ok else 'failed')


def md_to_course():
    """Convert a cached course markdown file to HTML and upload it page by page.

    The markdown file for the hard-coded course id is converted to HTML with
    pypandoc, then the top-level children of <body> are walked in order.
    Each <h1> starts a new page: its text becomes the page title and
    everything up to the next <h1> becomes the page body. Each page is handed
    to upload_page(), which asks for confirmation before sending.
    """
    course_id = "11214"
    infile = 'cache/pages/course_%s.md' % course_id
    output = 'cache/pages/course_%s_fixed.html' % course_id
    pypandoc.convert_file(infile, 'html', format='md', outputfile=output)

    # Close the converted file as soon as it has been parsed.
    with codecs.open(output,'r','utf-8') as converted:
        soup = bs(converted.read(), features="lxml")

    if soup.body is None:
        # Nothing was parsed (e.g. empty input); there is nothing to upload.
        print("No content found in %s" % output)
        return

    current_title = ""
    page_parts = []

    for child in soup.body.children:
        if child.name == "h1":
            if current_title:
                # A later <h1>: flush the page collected so far, then start a new one.
                upload_page(course_id, current_title, "".join(page_parts))
                page_parts = []
                current_title = child.get_text()
                print( "Next page: %s" % current_title )
            else:
                # First <h1>: anything collected before it stays in the first page.
                current_title = child.get_text()
        elif hasattr(child, 'prettify'):
            # Tags are serialized with prettify().
            page_parts.append(child.prettify(formatter="html"))
        else:
            # Plain text nodes have no prettify(); keep their text as-is.
            page_parts.append(child.string)

    # Flush the last page.
    upload_page(course_id, current_title, "".join(page_parts))
    print("Done")


# DL pages only
def grab_course_pages(course_num=-1):
    """Download every module Page of a course into one HTML file and one markdown file.

    Args:
        course_num: Canvas course id as an int or a string. A negative value
            (the default) prompts for the id interactively.

    Pages are visited in module order. For each page whose cleaned body
    (safe_html) is non-empty, the cleaned HTML is appended to
    ``cache/pages/course_<id>.html`` and a pandoc markdown conversion of the
    original body, with the page title as an <h1>, is appended to
    ``cache/pages/course_<id>.md``. The fetched module list is also stored in
    the module-level ``results``.
    """
    global results
    # Normalize to str first: comparing a string id with 0 raises TypeError.
    course_num = str(course_num)
    if course_num.startswith('-'):
        course_num = input("Id of course? ")

    # We want things in the order of the modules
    t4 = url + '/api/v1/courses/' + course_num + '/modules?include[]=items'
    results = fetch(t4)
    i = 1
    divider = "\n### "
    html_path = 'cache/pages/course_' + course_num + '.html'
    md_path = 'cache/pages/course_' + course_num + '.md'

    # Context managers close both files even if a fetch or conversion fails part-way.
    with codecs.open(html_path,'w','utf-8') as pageout, codecs.open(md_path,'w','utf-8') as pageoutm:
        for M in results:
            print("Module Name: " + M['name'])
            for I in M['items']:
                if I['type'] != 'Page':
                    continue
                pageout.write(divider+I['title']+'### '+I['page_url']+'\n')
                print("  " + str(i) + ". " + I['title'])
                t2 = url + '/api/v1/courses/' + course_num + '/pages/'+I['page_url']
                print('Getting: ' + t2)
                mypage = fetch(t2)
                fixed = safe_html(mypage['body'])
                if fixed:
                    markdown = pypandoc.convert_text("\n<h1>" + I['title'] + "</h1>\n" + mypage['body'], 'md', format='html')
                    pageout.write(fixed+'\n')
                    pageoutm.write(markdown+'\n')
                    pageout.flush()
                i += 1

# Download, clean html, and reupload page
def update_page():
    """Interactively pick one module page of a hard-coded course, clean its HTML, and re-upload it.

    Lists the Page items of every module in course 6862, asks for a 1-based
    number, passes the chosen page's body through safe_html(), and hands the
    result to upload_page() (which asks for its own confirmation).
    """
    # NOTE(review): the `while t: t = fetch(t)` loops below treat fetch()'s
    # return value as the next URL to request and then read the data from the
    # module-level `results` / `results_dict`. Other call sites in this file
    # use fetch()'s return value as the parsed data itself -- confirm which
    # contract the imported fetch() actually follows.
    global results, results_dict, url, header
    # course_num = raw_input("What is the course id? ")
    course_num = '6862'
    t = url + '/api/v1/courses/' + str(course_num) + '/pages'
    while t: t = fetch(t)
    pages = results
    results = []
    # mypagelist / myurllist / easier_filename are filled in below but never read again in this function.
    mypagelist = []
    myurllist = []
    # Titles and page urls of module Page items, in the order they are listed to the user.
    modpagelist = []
    modurllist = []
    for p in pages:
        p['title'] = clean_title(p['title'])
        mypagelist.append(p['title'])
        myurllist.append(p['url'])
        easier_filename = clean_title(p['url'])
        #for a in 'title,updated_at,published'.split(','):
        #    print unicode(p[a]), "\t",
        #print ""

    # We want things in the order of the modules
    t4 = url + '/api/v1/courses/'+str(course_num)+'/modules?include[]=items'
    while t4: t4 = fetch(t4)
    mods = results
    results = []
    i = 1
    print("\nWhat page do you want to repair?")
    for M in mods:
        print("Module Name: " + M['name'])
        for I in M['items']:
            if I['type']=='Page':
                modpagelist.append(I['title'])
                modurllist.append(I['page_url'])
                print("  " + str(i) + ". " + I['title'])
                i += 1

    # The menu is numbered from 1; convert the answer to a 0-based list index.
    # A non-numeric answer raises ValueError; an out-of-range one raises IndexError.
    choice = input("\n> ")
    choice = int(choice) - 1
    chosen_url = modurllist[choice]
    print('Fetching: ' + modpagelist[choice])
    t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/'+chosen_url
    print('From: ' + t2)

    results_dict = {}
    while(t2): t2 = fetch(t2)
    mypage = results_dict
    fixed_page = safe_html(mypage['body'])
    upload_page(course_num,chosen_url,fixed_page)

# given dict of file info (from files api), construct an img tag that works in a page
#def file_to_img_tag(f, alt, course, soup):
#    #tag = f"<img id=\"\" src=\"https://ilearn.gavilan.edu/courses/{course}/files/{f['id']}/preview\" alt=\"{f['filename']}\" "
#    #tag += f"data-api-endpoint=\"https://ilearn.gavilan.edu/api/v1/courses/{course}/files/{f['id']}\" data-api-returntype=\"File\" />"
#    return T


def html_file_to_page(filename, course, tags):
    """Parse a local HTML file and point its images at uploaded Canvas course files.

    Args:
        filename: Path of the HTML file to read.
        course: Canvas course id used when building the replacement image URLs.
        tags: Mapping of image file name -> file info dict; each dict must
            contain an ``'id'`` key.

    Returns:
        A dict with ``'title'`` (stripped text of <title>, or '') and
        ``'body'`` (the children of <body> joined as a string, or ''), or
        None if the file could not be read or parsed.

    The modified document is also written next to the input as
    ``<filename>_mod.html``.
    """
    try:
        with codecs.open(filename,'r', 'utf-8') as src_file:
            soup = bs(src_file.read(), 'html.parser')
    except Exception as e:
        print(f"Exception on {filename}: {e}")
        return

    # str.strip() returns a new string, so the stripped value has to be the one stored.
    result = {'title': (soup.title.text if soup.title else '').strip()}

    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            # Without a src there is no file name to look up.
            print("   skipping image with no src attribute")
            continue
        # Fall back to the src when the image has no alt text.
        alt = img.get('alt', src)
        orig_filename = os.path.basename(src)
        if orig_filename in tags:
            file_id = tags[orig_filename]['id']
            T = soup.new_tag(name='img', src=f"https://ilearn.gavilan.edu/courses/{course}/files/{file_id}/preview")
            T['id'] = file_id
            T['alt'] = alt
            T['data-api-endpoint'] = f"https://ilearn.gavilan.edu/api/v1/courses/{course}/files/{file_id}"
            T['data-api-returntype'] = "File"
            img.replace_with(T)
            print( f"   replaced image: {src}   alt: {alt}")
        else:
            print( f"   couldn't find replacement image: {src}   alt: {alt}")

    with codecs.open(filename+"_mod.html", 'w', 'utf-8') as outfile:
        outfile.write( soup.prettify() )

    result['body'] = ''.join(map(str, soup.body.contents)) if soup.body else ''
    return result

def create_new_page(course_id, title, body):
    """Create a wiki page in a course and report the outcome on stdout.

    Args:
        course_id: Canvas course id.
        title: Page title.
        body: Page HTML; its length is printed before the request is sent.

    The fields are sent as a form body. On success the ``url`` field of the
    JSON response is printed; otherwise a problem line with the HTTP status
    code is printed.
    """
    print(f"Creating page: {title}, length: {len(body)}")
    request = f"{url}/api/v1/courses/{course_id}/pages"
    print(request)
    data = { 'wiki_page[title]': title, 'wiki_page[body]': body }
    r3 = requests.post(request, headers=header, data=data)
    try:
        result = json.loads(r3.text)
        print( f"    + ok: {result['url']}")
    except (ValueError, KeyError, TypeError):
        # ValueError: body is not JSON; KeyError/TypeError: JSON without a 'url' field.
        # A bare except here would also swallow KeyboardInterrupt.
        print(f"    - problem creating page? (HTTP {r3.status_code})")

# Given a folder full of html pages and their linked images, create Canvas PAGES of them
def make_pages_from_folder(folder='cache/csis6/', course = '20558', refresh_file_list=False):
    """Create Canvas pages from local HTML files, rewriting images to course files.

    Args:
        folder: Folder containing the HTML files.
        course: Canvas course id the pages are created in.
        refresh_file_list: When true, re-fetch the course's file list from the
            API and rewrite the local cache before using it. The default
            reads the existing cache only.

    The list of HTML files to process is currently hard-coded (see below).
    """
    file_list_cache = 'cache/csis6filelist.json'

    if refresh_file_list:
        request = f"{url}/api/v1/courses/{course}/files"
        print("Fetching course files")
        with codecs.open(file_list_cache,'w','utf-8') as cache_out:
            cache_out.write(json.dumps(fetch(request)))

    with codecs.open(file_list_cache, 'r', 'utf-8') as cache_in:
        files = json.loads(cache_in.read())

    # Only .jpg/.png files are candidates for <img> replacement, keyed by file name.
    tags = {f['filename']: f for f in files if f['filename'].lower().endswith(('.jpg', '.png'))}

    contents = os.listdir(folder)
    # NOTE: the directory listing is overridden by this fixed list of files.
    contents = ['welcome.html','welcome2.html', 'welcome3.html']
    print(contents)
    for f in contents:
        m = re.search(r'^(.*)\.(html?)$', f)
        if m:
            print(f"html file: {m.group(1)}, extension: {m.group(2)}")
            newpage = html_file_to_page(os.path.join(folder, f), course, tags)
            if newpage is None:
                # html_file_to_page returns None when the file cannot be read or parsed.
                print(f"   skipping {f}: could not be converted")
                continue
            create_new_page(course, newpage['title'], newpage['body'])
        else:
            m = re.search(r'^(.*)\.(.*)$', f)
            if m:
                print(f"other file: {m.group(1)}, extension: {m.group(2)}")
            else:
                print(f"unknown file: {f}")


# Given course, page url, and new content, upload the new revision of a page
def upload_page(course_num,pageurl,new_content):
    """Upload new body content for an existing course page after confirmation.

    Args:
        course_num: Canvas course id; converted with str() when building the URL.
        pageurl: Page identifier appended to ``/pages/`` in the request URL.
        new_content: New HTML body for the page.

    Nothing is sent unless the user types ``1`` at the prompt. The raw
    response object is printed, followed by 'ok' or 'failed' depending on the
    HTTP status.
    """
    print(f"Uploading page: {pageurl}")
    t3 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + pageurl
    xyz = input('Enter 1 to continue and send back to: ' + t3 + ': ')
    if xyz != '1':
        return

    data = {'wiki_page[body]':new_content}
    # Send the body as form data. Passing it via params= would put the whole
    # page HTML in the URL query string, which fails for larger pages.
    r3 = requests.put(t3, headers=header, data=data)
    print(r3)
    print('ok' if r3.ok else 'failed')


def multiple_downloads():
    """Prompt for several course ids and run course_download() on each one."""
    x = input("What IDs? Separate with one space: ")
    # split() with no argument ignores repeated, leading and trailing
    # whitespace, so no empty id is ever passed to course_download().
    for course_id in x.split():
        course_download(course_id)


def fetch_support_page():
    """Fetch the 'online-student-support-hub' page of course 20850 and print it.

    Prints the request URL, then the whole response as indented JSON, then the
    page's 'body' field on its own.
    """
    # Browser address of the same page:
    # https://ilearn.gavilan.edu/courses/20850/pages/online-student-support-hub
    course_num = 20850
    page_url = "online-student-support-hub"
    endpoint = "%s/api/v1/courses/%s/pages/%s" % (url, course_num, page_url)
    print('Getting: ' + endpoint)
    page = fetch(endpoint)
    print(json.dumps(page,indent=2))
    print(page['body'])


from courses import getCoursesInTerm

def clear_old_page(shell_id,page_name):
    """Delete every page in a course whose title equals page_name.

    Args:
        shell_id: Canvas course id.
        page_name: Exact page title to match.
    """
    listing_url = f"{url}/api/v1/courses/{shell_id}/pages"
    for entry in fetch(listing_url):
        if entry['title'] != page_name:
            continue
        print(f"found a page named {page_name}. Deleting it.")
        doomed_id = entry['page_id']
        reply = requests.delete(f"{url}/api/v1/courses/{shell_id}/pages/{doomed_id}", headers=header)
        print(f"{reply}")

def add_support_page_full_semester(term=289):
    """Offer to add the support page to every course returned for a term.

    Args:
        term: Term id passed to getCoursesInTerm().

    Each course is confirmed individually: '1' adds the page, 'all' adds it to
    this course and every remaining one without asking again, and any other
    answer skips the course.
    """
    print("Fetching list of all active courses")
    courses = getCoursesInTerm(term,0,0)

    confirm_each = True
    print("answer 'all' to do the rest without confirming")

    for course in courses:
        if confirm_each:
            reply = input(f"Type 1 <enter> to add support page to {course['id']} ({course['name']}) ")
            if reply == 'all':
                # Stop asking from here on; this course is processed too.
                confirm_each = False
            elif reply != '1':
                continue
        create_support_page(course['id'])

def create_support_page(shell_id=18297):    # 29):
    """Replace the 'Online Student Support Hub' page in a course and link it in a module.

    Args:
        shell_id: Canvas course id.

    Steps:
      1. Read the page body from ``cache/support_min.html``.
      2. Delete any existing page with the same title (clear_old_page).
      3. PUT the page at ``pages/online-student-support-hub``, published.
      4. Choose the target module: the one with position 1, or a newly
         created and published 'Welcome' module if the course has none.
      5. Add the page to that module as the first item.

    Stops after step 3 if the page response cannot be parsed, and after
    step 4 if no target module was found.
    """
    title = "Online Student Support Hub"

    # Read the replacement content before deleting anything, so a missing
    # file cannot leave the course without its support page.
    with codecs.open("cache/support_min.html","r","utf-8") as support_file:
        new_content = support_file.read()

    # clear one of same name first.
    clear_old_page(shell_id, title)

    # make new one
    t3 = f"{url}/api/v1/courses/{shell_id}/pages/online-student-support-hub"
    data = {'wiki_page[body]':new_content, 'wiki_page[title]':title, 'wiki_page[published]':"true"}
    # Form body rather than query string: the HTML does not have to fit in a URL.
    r3 = requests.put(t3, headers=header, data=data)

    try:
        response = r3.json()
        page_slug = response['url']
        print('Page Created')
        print(f"page id: {response['page_id']}")
    except (ValueError, KeyError, TypeError) as e:
        # Without the page's url there is nothing to link into a module.
        print(f"Exception: {e}")
        print(f"Could not create the support page (HTTP {r3.status_code}); no module item added.")
        return

    # list modules
    # GET /api/v1/courses/:course_id/modules
    t4 = f"{url}/api/v1/courses/{shell_id}/modules"
    modules = fetch(t4)
    module_id = 0

    # what if there are no modules?
    if len(modules) == 0:
        t6 = f"{url}/api/v1/courses/{shell_id}/modules/"
        mod_data = {'module[name]': 'Welcome', 'module[unlock_at]':"2024-01-01T06:00:00-08:00"}
        r6 = requests.post(t6, headers=header, params=mod_data)
        mod_response = r6.json()
        module_id = mod_response['id']
        print(f"created module, id: {module_id}")

        # publish module
        t7 = f"{url}/api/v1/courses/{shell_id}/modules/{module_id}"
        mod_data2 = {'module[published]':'true'}
        requests.put(t7, headers=header, params=mod_data2)

    for M in modules:
        if M['position'] == 1:
            module_id = M['id']
            print(f"found first module 1: ({module_id}) {M['name']}")

    if not module_id:
        # Modules exist but none reports position 1; do not post to module id 0.
        print("No module at position 1 found; page was created but not added to a module.")
        return

    # create module item
    # POST /api/v1/courses/:course_id/modules/:module_id/items
    t5 = f"{url}/api/v1/courses/{shell_id}/modules/{module_id}/items"
    item_data = {'module_item[title]': title, 'module_item[type]': 'Page', 'module_item[page_url]': page_slug, 'module_item[position]':1}
    requests.post(t5, headers=header, params=item_data)

    print('ok')

def list_modules_and_items(shell_id, verbose=0):
    """Fetch a course's modules, requesting items and content details inline.

    Args:
        shell_id: Canvas course id.
        verbose: When truthy, also print the result as indented JSON.

    Returns:
        Whatever fetch() returns for the modules endpoint.
    """
    endpoint = "%s/api/v1/courses/%s/modules?include[]=items&include[]=content_details" % (url, shell_id)
    module_data = fetch(endpoint)
    if verbose:
        print(json.dumps(module_data,indent=2))
    return module_data

def check_modules_for_old_orientation():
    """Report courses in two hard-coded terms that still contain the old support module.

    For every course in terms 286 and 287, the module list is searched for the
    module named 'Online Student Support Services - Summer & Fall 2024'. The
    'title' values found in each matching module are printed, and the ids of
    all matching courses are listed at the end.
    """
    from util import contains_key_value, find_dict_with_key_value, extract_key_values

    old_module_name = 'Online Student Support Services - Summer & Fall 2024'
    flagged = []

    for term in (286, 287):   # wi25, sp25
        print("Fetching list of all active courses")
        for course in getCoursesInTerm(term,0,0):
            print(f"{course['id']} - {course['name']}")
            course_modules = list_modules_and_items(course['id'])

            if not contains_key_value(course_modules, 'name', old_module_name):
                continue

            old_mod = find_dict_with_key_value(course_modules,'name',old_module_name)
            print("   this course has the old module")
            flagged.append(f"{course['id']}")
            for item_title in extract_key_values(old_mod, 'title'):
                print(f"  {item_title}")

    print("\nCheck these course ids:")
    for course_id in flagged:
        print(course_id)


def repair_ezproxy_links():
    """Print the lines mentioning 'ezproxy' in every page returned by pages_in_term().

    Pages whose title contains 'Online Library Services' are skipped. Nothing
    is modified; matches are only reported on stdout. Rows that raise while
    being examined are silently ignored.
    """
    from localcache2 import pages_in_term

    # Row layout: c.id, c.course_code, c.sis_source_id, wp.id as wp_id, wp.title, wp.url, c.name , wp.body
    for row in pages_in_term():
        course_code, page_title, page_url, page_body = row[1], row[4], row[5], row[7]
        try:
            if re.search(r'Online Library Services',page_title):
                continue
            # Whole lines (bounded by newlines on both sides) containing 'ezproxy'.
            hits = re.findall('\n.*ezproxy.*\n',page_body)
            if hits:
                print(course_code, page_title, page_url)
                print("   ", hits, "\n")
        except Exception:
            # Best effort: skip rows that cannot be searched or printed.
            pass


def download_web():
    """Crawl a hard-coded start URL and merge the extracted content into Markdown and .docx.

    Pages in the same URL folder as the start page (or below it) are fetched
    breadth-first. From each page the nodes matching an XPath expression are
    extracted, their images are downloaded into an ``assets`` folder, and the
    fragment is converted to Markdown with the ``pandoc`` executable. All
    sections are written to ``merged.md``, which is then converted to
    ``merged.docx``.
    """
    import os, re, time, hashlib, mimetypes, subprocess
    from collections import deque
    from urllib.parse import urlsplit, urlunsplit, urljoin
    import posixpath as ppath
    import requests
    from lxml import html

    SESSION = requests.Session()
    SESSION.headers.update({
        "User-Agent": "MiniXPathCrawler/1.0 (+for personal archiving; contact admin if issues)"
    })

    def normalize_path(path: str) -> str:
        """Return the POSIX-normalized path, always starting with '/'."""
        np = ppath.normpath(path or "/")
        if not np.startswith("/"):
            np = "/" + np
        return np

    def base_dir_of(path: str) -> str:
        """Return the folder part of a URL path, normalized and ending in '/'."""
        # Ensure trailing slash for folder comparison
        if not path or path.endswith("/"):
            bd = path or "/"
        else:
            bd = ppath.dirname(path) + "/"
        bd = normalize_path(bd)
        if not bd.endswith("/"):
            bd += "/"
        return bd

    def canonical_url(u: str, drop_query=True) -> str:
        """Return u with a normalized path, no fragment and (by default) no query."""
        sp = urlsplit(u)
        path = normalize_path(sp.path)
        if drop_query:
            sp = sp._replace(path=path, query="", fragment="")
        else:
            sp = sp._replace(path=path, fragment="")
        return urlunsplit(sp)

    def same_folder_or_below(start_url: str, link_url: str) -> bool:
        """True if link_url is on the same scheme/host and inside start_url's folder."""
        su = urlsplit(start_url); lu = urlsplit(link_url)
        if su.scheme != lu.scheme or su.netloc != lu.netloc:
            return False
        bd = base_dir_of(su.path)  # e.g., "/a/b/"
        tp = normalize_path(lu.path)  # e.g., "/a/b/page.html"
        return (tp == bd[:-1]) or tp.startswith(bd)

    def is_html_response(resp: requests.Response) -> bool:
        """True if the response's Content-Type header mentions html."""
        ctype = resp.headers.get("Content-Type", "")
        return "html" in ctype.lower()

    def fetch_html(url: str, timeout=20):
        """GET url and parse it; return (response, document) or (None, None) on any failure."""
        try:
            r = SESSION.get(url, timeout=timeout, allow_redirects=True)
        except requests.RequestException:
            return None, None
        if r.status_code != 200 or not is_html_response(r):
            return None, None
        try:
            doc = html.fromstring(r.content)
        except Exception:
            return None, None
        # make links absolute for easier handling of images and hrefs
        doc.make_links_absolute(r.url)
        return r, doc

    def safe_filename_from_url(u: str, default_ext=".bin") -> str:
        """Build a file name from a hash of the URL plus a best-effort extension."""
        h = hashlib.sha1(u.encode("utf-8")).hexdigest()[:16]
        ext = ""
        path = urlsplit(u).path
        if "." in path:
            ext = "." + path.split(".")[-1].split("?")[0].split("#")[0]
            # Only accept short alphanumeric extensions; otherwise use the default.
            if not re.match(r"^\.[A-Za-z0-9]{1,5}$", ext):
                ext = ""
        return h + (ext or default_ext)

    def download_image(img_url: str, assets_dir: str) -> str | None:
        """Download one image into assets_dir; return its local path, or None on failure."""
        try:
            r = SESSION.get(img_url, timeout=20, stream=True)
        except requests.RequestException:
            return None
        try:
            if r.status_code != 200:
                return None
            # Fallback extension from Content-Type, used when the URL path has no usable one.
            ext = None
            ctype = r.headers.get("Content-Type", "")
            if "/" in ctype:
                ext_guess = mimetypes.guess_extension(ctype.split(";")[0].strip())
                if ext_guess:
                    ext = ext_guess
            fname = safe_filename_from_url(img_url, default_ext=ext or ".img")
            os.makedirs(assets_dir, exist_ok=True)
            fpath = os.path.join(assets_dir, fname)
            try:
                with open(fpath, "wb") as f:
                    for chunk in r.iter_content(65536):
                        if chunk:
                            f.write(chunk)
            except (OSError, requests.RequestException):
                return None
            return fpath
        finally:
            # A streamed response holds its connection until it is closed.
            r.close()

    def html_fragment_from_xpath(doc, xpath_expr: str, assets_dir: str):
        """Return (html_fragment, title) for the nodes matching xpath_expr, or (None, None)."""
        nodes = doc.xpath(xpath_expr)
        if not nodes:
            return None, None  # (html_fragment, title)
        # Remove <script>/<style>/<noscript> inside nodes. drop_tree() keeps
        # the text that follows the removed element (its tail), which
        # getparent().remove() would discard with it.
        for n in nodes:
            for bad in n.xpath(".//script|.//style|.//noscript"):
                bad.drop_tree()

        # Download images and rewrite src
        for n in nodes:
            for img in n.xpath(".//img[@src]"):
                src = img.get("src")
                if not src:
                    continue
                local = download_image(src, assets_dir)
                if local:
                    # Use relative path from markdown file location later (we'll keep md in parent of assets)
                    rel = os.path.join("assets", os.path.basename(local)).replace("\\", "/")
                    img.set("src", rel)

        frag_html = "".join(html.tostring(n, encoding="unicode") for n in nodes)
        # Title from <title> or first heading in fragment
        doc_title = (doc.xpath("string(//title)") or "").strip()
        if not doc_title:
            h = html.fromstring(frag_html)
            t2 = (h.xpath("string(//h1)") or h.xpath("string(//h2)") or "").strip()
            doc_title = t2 or "Untitled"
        return frag_html, doc_title

    def html_to_markdown_with_pandoc(html_str: str) -> str:
        """Convert HTML to GitHub-flavored Markdown via pandoc; return the HTML unchanged on failure."""
        try:
            p = subprocess.run(
                ["pandoc", "-f", "html", "-t", "gfm"],
                input=html_str.encode("utf-8"),
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=False,
            )
            if p.returncode == 0:
                return p.stdout.decode("utf-8", errors="ignore")
            # fallback to raw HTML if conversion failed
            return html_str
        except FileNotFoundError:
            # pandoc missing; return raw HTML
            return html_str

    def build_docx_from_markdown(md_path: str, out_docx: str, resource_path: str):
        """Run pandoc to build a .docx with a table of contents; raises CalledProcessError on failure."""
        cmd = [
            "pandoc",
            "-s",
            md_path,
            "-o",
            out_docx,
            "--toc",
            "--toc-depth=3",
            f"--resource-path={resource_path}",
            "--from=markdown+raw_html",
        ]
        subprocess.run(cmd, check=True)

    def crawl(start_url: str, xpath_expr: str, out_dir: str, max_pages: int, delay: float):
        """Breadth-first crawl from start_url, writing merged.md and merged.docx into out_dir.

        max_pages bounds the number of distinct canonical URLs attempted
        (including ones that fail); delay is the pause in seconds after each
        processed page.
        """
        os.makedirs(out_dir, exist_ok=True)
        assets_dir = os.path.join(out_dir, "assets")
        os.makedirs(assets_dir, exist_ok=True)

        # NOTE(review): visited holds canonical URLs, and canonical_url() drops
        # the query string, so pages that differ only by query count as one
        # page. The start URL below carries a ?guid= query -- confirm that
        # this de-duplication is intended for that site.
        visited = set()
        q = deque([start_url])

        md_sections = []

        while q and len(visited) < max_pages:
            page_url = q.popleft()
            canon = canonical_url(page_url)
            if canon in visited:
                continue
            visited.add(canon)

            resp, doc = fetch_html(page_url)
            if doc is None:
                print(f"[skip] Non-HTML or fetch failed: {page_url}")
                continue

            # Extract and rewrite images for the chosen XPath fragment
            frag_html, title = html_fragment_from_xpath(doc, xpath_expr, assets_dir)
            if frag_html:
                md = html_to_markdown_with_pandoc(frag_html)
                section = f"# {title}\n\n_Source: {resp.url}_\n\n{md}\n"
                md_sections.append(section)
                print(f"[ok]  {resp.url}")

            # Enqueue in-scope links (from the whole page)
            for a in doc.xpath("//a[@href]"):
                href = a.get("href")
                if not href:
                    continue
                absu = urljoin(resp.url, href)
                # Drop fragments for comparison/enqueue
                absu_nf = urlunsplit(urlsplit(absu)._replace(fragment=""))
                # Compare in canonical form, the form stored in visited, so
                # already-seen pages are not queued again.
                if canonical_url(absu_nf) in visited:
                    continue
                if same_folder_or_below(start_url, absu_nf):
                    q.append(absu_nf)

            time.sleep(delay)

        merged_md = os.path.join(out_dir, "merged.md")
        with open(merged_md, "w", encoding="utf-8") as f:
            f.write("\n\n".join(md_sections))

        out_docx = os.path.join(out_dir, "merged.docx")
        try:
            build_docx_from_markdown(merged_md, out_docx, out_dir)
        except subprocess.CalledProcessError as e:
            print("[warn] pandoc failed to create .docx:", e)

        print(f"\nDone.\nMarkdown: {merged_md}\nWord:     {out_docx}\nPages:    {len(md_sections)} (in scope)")

    myurl = "https://govt.westlaw.com/calregs/Browse/Home/California/CaliforniaCodeofRegulations?guid=I2A5DA5204C6911EC93A8000D3A7C4BC3&originationContext=documenttoc&transitionType=Default&contextData=(sc.Default)"
    crawl(myurl, '//*[@id="co_contentColumn"]', "cache/content", 600, 0.65)


def flowgrid():
    # a tiny DSL for lane/step "flow grid" diagrams rendered to HTML.

    from dataclasses import dataclass, field
    from pathlib import Path
    from typing import List, Optional, Dict
    import re
    import html

    # ---------------------- Data model ----------------------

    @dataclass
    class Step:
        code: str
        label: str
        weeks: Optional[str] = None
        hours: Optional[str] = None
        tag: Optional[str] = None   # 'req' | 'rec' | None
        desc: Optional[str] = None  # override/extra text for subline
        klass: Optional[str] = None # additional css class

    @dataclass
    class Lane:
        name: str
        steps: List[Step] = field(default_factory=list)

    @dataclass
    class Doc:
        title: str
        lanes: List[Lane] = field(default_factory=list)
        css_vars: Dict[str, str] = field(default_factory=dict)

    # ---------------------- Parser ----------------------

    def parse_spec(text: str) -> Doc:
        """
        DSL syntax:
        - Comments start with '#'
        - KEY: value  (supported keys: TITLE, VAR)
            VAR: --step=260px; --arrow=34px; --done=110px (semicolon separated; optional)
        - LANE: <name>
        STEP: CODE | LABEL | weeks=2; hours=20; tag=req
        STEP: CODE | LABEL | desc=1 hour on-site; tag=rec
        - Empty lines are ignored.
        - Indentation is optional and only for readability.
        """
        title = "Untitled Diagram"
        lanes: List[Lane] = []
        current_lane: Optional[Lane] = None
        css_vars: Dict[str, str] = {}

        for raw in text.splitlines():
            line = raw.strip()
            if not line or line.startswith("#"):
                continue

            # KEY: value
            m = re.match(r'(?i)TITLE\s*:\s*(.+)$', line)
            if m:
                title = m.group(1).strip()
                continue

            # VAR line
            m = re.match(r'(?i)VAR\s*:\s*(.+)$', line)
            if m:
                # semicolon separated k=v; allow CSS custom props like --step=300px
                blob = m.group(1)
                parts = [p.strip() for p in blob.split(";") if p.strip()]
                for p in parts:
                    if "=" in p:
                        k, v = p.split("=", 1)
                        css_vars[k.strip()] = v.strip()
                continue

            # LANE
            m = re.match(r'(?i)LANE\s*:\s*(.+)$', line)
            if m:
                current_lane = Lane(name=m.group(1).strip())
                lanes.append(current_lane)
                continue

            # STEP
            m = re.match(r'(?i)STEP\s*:\s*(.+)$', line)
            if m:
                if current_lane is None:
                    raise ValueError("STEP appears before any LANE is defined.")
                body = m.group(1)
                # Expect: CODE | LABEL | attrs
                parts = [p.strip() for p in body.split("|")]
                if len(parts) < 2:
                    raise ValueError(f"STEP needs 'CODE | LABEL | ...' got: {body}")
                code = parts[0]
                label = parts[1]
                attrs_blob = parts[2] if len(parts) >=3 else ""

                # Parse attrs: key=value; key=value
                step_kwargs = {}
                if attrs_blob:
                    for kv in [a.strip() for a in attrs_blob.split(";") if a.strip()]:
                        if "=" in kv:
                            k, v = kv.split("=", 1)
                            step_kwargs[k.strip().lower()] = v.strip()
                        else:
                            # allow bare tag 'req' or 'rec'
                            if kv.lower() in ("req", "rec"):
                                step_kwargs["tag"] = kv.lower()

                step = Step(
                    code=code,
                    label=label,
                    weeks=step_kwargs.get("weeks") or step_kwargs.get("w"),
                    hours=step_kwargs.get("hours") or step_kwargs.get("hrs") or step_kwargs.get("h"),
                    tag=normalize_tag(step_kwargs.get("tag")),
                    desc=step_kwargs.get("desc"),
                    klass=step_kwargs.get("class") or step_kwargs.get("klass"),
                )
                current_lane.steps.append(step)
                continue

            raise ValueError(f"Unrecognized line: {line}")

        return Doc(title=title, lanes=lanes, css_vars=css_vars)

    def normalize_tag(tag: Optional[str]) -> Optional[str]:
        if not tag:
            return None
        t = tag.lower().strip()
        if t in ("req", "required"):
            return "req"
        if t in ("rec", "recommended"):
            return "rec"
        if t in ("none", "na", "n/a", "optional"):
            return None
        return t

    # ---------------------- HTML rendering ----------------------

    BASE_CSS = r"""
    :root{
    --ink:#0f172a;
    --reqBorder:#2e7d32; --reqFill:#eef7ef;
    --recBorder:#8a8a8a; --recFill:#ffffff;
    --doneBorder:#9ca3af; --doneInk:#475569;
    --modeCol:180px; --gap:12px;
    --step:260px; --arrow:34px; --done:110px;
    }
    html,body{margin:0;background:#f6f7fb;font-family:system-ui,-apple-system,Segoe UI,Roboto,Arial,sans-serif;color:var(--ink)}
    .wrap{margin:24px auto 48px;padding:0 16px}
    h1{font-size:22px;margin:0 0 8px}
    .legend{display:flex;gap:18px;align-items:center;font-size:14px;margin:6px 0 24px}
    .tag{display:inline-block;padding:2px 8px;border-radius:999px;border:1.5px solid var(--reqBorder);background:var(--reqFill);font-size:12px}
    .tag.rec{border-color:var(--recBorder);background:var(--recFill);border-style:dashed}
    .grid{display:flex;flex-direction:column;gap:18px}
    .lane{display:grid;grid-template-columns:var(--modeCol) 1fr;gap:var(--gap);align-items:center;background:#ffffffcc;padding:12px;border-radius:12px}
    .mode{font-weight:700;text-align:center;background:#fff;padding:16px 10px}
    .flow{display:grid;align-items:center;gap:8px;padding:8px 0;}
    .header {grid-column: span 4; }
    .step{border-radius:10px;padding:10px 12px;border:2px solid var(--reqBorder);background:var(--reqFill);min-height:64px}
    .step .title{font-weight:700}
    .step .sub{font-size:12px;opacity:.8}
    .step.rec{border-color:var(--recBorder);border-style:dashed;background:var(--recFill)}
    .slot{}
    .arrow{font-size:22px;line-height:1;text-align:center}
    .arrow.blank{color:transparent}
    .done{justify-self:start;border-radius:999px;border:2px dashed var(--doneBorder);padding:10px 14px;color:var(--doneInk);background:#fff;text-align:center}
    @media (max-width:900px){
    .lane{grid-template-columns:1fr}
    .mode{order:-1}
    .flow{grid-template-columns:1fr; background:none}
    .arrow{display:none}
    }
    """

    def format_sub(step: Step) -> str:
        if step.desc:
            core = html.escape(step.desc)
        else:
            bits = [html.escape(step.label)]
            wh = []
            if step.weeks:
                wh.append(f"{html.escape(str(step.weeks))} weeks")
            if step.hours:
                wh.append(f"~{html.escape(str(step.hours))} hrs")
            if wh:
                bits.append(" · " + " (".join([wh[0], " ".join(wh[1:])]) + ")" if len(wh)>1 else " · " + wh[0])
                # Actually, the original used " · 2 weeks (~20 hrs)"
                # Let's just do that directly:
                if step.weeks and step.hours:
                    bits[-1] = f" · {html.escape(str(step.weeks))} weeks (~{html.escape(str(step.hours))} hrs)"
            # Combine
            core = "".join(bits)
        # Tag
        if step.tag == "req":
            tag_html = '<span class="tag">Required</span>'
        elif step.tag == "rec":
            tag_html = '<span class="tag rec">Recommended</span>'
        else:
            tag_html = ""
        if tag_html:
            return f'{core} · {tag_html}'
        return core

    def render_html(doc: Doc) -> str:
        """Render *doc* as a single self-contained HTML page and return it.

        The page embeds ``BASE_CSS``, any ``doc.css_vars`` overrides (emitted
        as a ``:root`` rule) and a generated ``.flow`` grid template sized for
        the lane with the most steps.  A title lane comes first, then one lane
        per entry of ``doc.lanes``.  All text taken from *doc* is HTML-escaped;
        the sub-line of each step comes from ``format_sub``.
        """
        esc = html.escape
        widest = max((len(lane.steps) for lane in doc.lanes), default=1)
        # One "step + arrow" column pair per step slot, then the Done column.
        columns = " ".join(["var(--step) var(--arrow)"] * widest) + " var(--done)"

        out = [
            "<!DOCTYPE html><html><head><meta charset='utf-8'>",
            "<meta name='viewport' content='width=device-width, initial-scale=1'>",
            "<title>" + esc(doc.title) + "</title>",
            "<style>",
            BASE_CSS,
        ]
        if doc.css_vars:
            declarations = "\n".join(f"  {k}: {v};" for k, v in doc.css_vars.items())
            out.append(":root{\n" + declarations + "\n}\n")
        out.extend([
            f".flow{{grid-template-columns:{columns};}}",
            "</style></head><body>",
            "<div class='wrap'><div class='grid'>",
            # Header/Title lane
            "<div class='lane'><div class='mode'>&nbsp;</div><div class='flow'><div class='header'><h1>",
            esc(doc.title),
            "</h1></div></div></div>",
        ])

        for lane in doc.lanes:
            out.append("<div class='lane'>")
            out.append(f"<div class='mode'>{esc(lane.name)}</div>")
            out.append("<div class='flow'>")

            for step in lane.steps:
                classes = ["step"]
                if step.tag == "rec":
                    classes.append("rec")
                if step.klass:
                    classes.append(esc(step.klass))
                css_class = " ".join(classes)
                out.extend([
                    f"<div class='{css_class}'>",
                    f"<div class='title'>{esc(step.code)}</div>",
                    f"<div class='sub'>{format_sub(step)}</div>",
                    "</div>",
                    # Every step, including the last, is followed by an arrow
                    # so the column pairs of the grid template stay aligned.
                    "<div class='arrow'>→</div>",
                ])

            # Pad shorter lanes with empty slots and invisible arrows.
            padding = ["<div class='slot'></div>", "<div class='arrow blank'>→</div>"]
            out.extend(padding * (widest - len(lane.steps)))

            # Done bubble, then close the flow and the lane.
            out.append("<div class='done'>Done</div>")
            out.append("</div></div>")

        out.append("</div></div></body></html>")
        return "".join(out)


    spec_text = '''
TITLE: Online Teaching Requirements and Recommendations
# Optional CSS overrides
VAR: --step=180px; --modeCol=180px

LANE: In Person (with Canvas)
  STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=rec

LANE: Online
  STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
  STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req

LANE: Hybrid
  STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
  STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req
  STEP: GOTT 5 | Essentials of Blended Learning | weeks=2; hours=20; tag=rec

LANE: Online Live
  STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
  STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req
  STEP: GOTT 6 | Introduction to Live Online Teaching and Learning | weeks=2; hours=20; tag=rec

LANE: HyFlex
  STEP: GOTT 1 | Intro to Online Teaching with Canvas | weeks=2; hours=20; tag=req
  STEP: GOTT 2 | Introduction to Asynchronous Online Teaching and Learning | weeks=4; hours=40; tag=req
  STEP: GOTT 6 | Introduction to Live Online Teaching and Learning | weeks=2; hours=20; tag=rec
  # You can override the subline using desc=
  STEP: HyFlex Tech Training | ~1 hour on-site | desc=~1 hour on-site; tag=rec

'''
    doc = parse_spec(spec_text)
    out_html = render_html(doc)
    Path('cache/flow.html').write_text(out_html, encoding="utf-8")
    print(f"Wrote cache/flow.html")


################
################  GOOGLE DOCS HELPERS (moved from pipelines)
################

def sec(t):
    """Wrap *t* in an ``<h3>`` section heading, followed by a newline."""
    return "".join(("<h3>", t, "</h3>\n"))
def para(t):
    """Wrap *t* in a ``<p>`` element, followed by a newline."""
    return "".join(("<p>", t, "</p>\n"))
def ul(t):
    """Wrap *t* in a ``<ul>`` element, followed by a newline."""
    return "".join(("<ul>", t, "</ul>\n"))
def li(t):
    """Wrap *t* in an ``<li>`` element, followed by a newline."""
    return "".join(("<li>", t, "</li>\n"))

def question(t,bracket=1):
    """Return the opening HTML of an accordion entry for question text *t*.

    The result is a named anchor followed by the accordion wrapper, the
    question in an ``<h4>``, and the opening ``<div class="acrd_cntnt">``
    (which ``answer`` later closes).

    The anchor name is chosen as follows:

    * if *bracket* is truthy and *t* contains ``[...]``, the bracketed text
      is used as-is and the bracketed part is removed from the heading;
    * otherwise it is the lower-cased initials of the space-separated words
      of *t* that start with an ASCII letter.

    Empty tokens (from an empty string or repeated/leading/trailing spaces)
    are skipped instead of raising ``IndexError``.
    """
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1)
        t = re.sub(r'\[.*\]', '', t)
    else:
        # word[:1] is '' for an empty token, which simply fails the letter test.
        anchor = ''.join(
            word[0] for word in t.split(' ') if re.search(r'[a-zA-Z]', word[:1])
        ).lower()
    return (
        "<a name='" + anchor + "'></a>"
        + '<div class="accordion" data-accordion=""><h4 class="acrd_cntl">' + t + '</h4>\n<div class="acrd_cntnt">'
    )

def answer(t):
    """Append to *t* the two closing ``</div>`` tags of an accordion entry."""
    closing = '</div></div>\n'
    return "".join((t, closing))

def read_paragraph_element(element,type="NORMAL_TEXT"):
    """Convert one paragraph element dict into an HTML fragment.

    Parameters:
        element: dict for a single paragraph element; only its ``textRun``
            entry is read.  Elements without a text run yield ``''``.
        type: named style of the enclosing paragraph.  Bold runs are wrapped
            in ``<strong>`` only when this is ``"NORMAL_TEXT"``.

    Returns:
        The run's text with vertical-tab characters (U+000B) turned into
        ``<br />`` line breaks, wrapped in ``<a href=...>`` when the run's
        style has a link with a ``url``, and in ``<strong>`` when bold.

    Links without a ``url`` key and runs without ``content`` are tolerated:
    the former are rendered as plain text, the latter as empty text.
    The text and URL are inserted without HTML escaping.
    """
    text_run = element.get('textRun')
    if not text_run:
        return ''

    style = text_run.get('textStyle') or {}
    begin = ''
    end = ''

    url = (style.get('link') or {}).get('url')
    if url:
        begin = '<a href="' + url + '">'
        end = '</a>'

    if style.get('bold') is True and type == "NORMAL_TEXT":
        begin = '<strong>' + begin
        end = end + '</strong>'

    content = (text_run.get('content') or '').replace('\u000b', '<br />\n')
    return begin + content + end

def read_paragraph_element_2(element,type="NORMAL_TEXT"):
    """Alias of ``read_paragraph_element``: forwards both arguments unchanged."""
    fragment = read_paragraph_element(element, type)
    return fragment


# t is a string that begins with "Icons: " ... and contains comma(space) separated list
def handle_icons(t):
    """Split an ``Icons: a, b, c`` line into ``('icons', ['a', 'b', 'c'])``.

    The first 7 characters (the length of ``"Icons: "``) are dropped without
    being checked, the remainder is stripped, then split on ``", "``.
    """
    prefix_len = 7  # len("Icons: ")
    names = t[prefix_len:].strip().split(", ")
    return ('icons', names)

# t is a string that begins with "Tags: " ... and contains comma(space) separated list
def handle_tags(t):
    """Split a ``Tags: a, b, c`` line into ``('tags', ['a', 'b', 'c'])``.

    The first 6 characters (the length of ``"Tags: "``) are dropped without
    being checked, the remainder is stripped, then split on ``", "``.
    """
    prefix_len = 6  # len("Tags: ")
    names = t[prefix_len:].strip().split(", ")
    return ('tags', names)

def handle_question(t,bracket=1):
    """Return ``('question', text, anchor)`` for question text *t*.

    If *bracket* is truthy and *t* contains ``[...]``, the anchor is the
    lower-cased bracketed text and the bracketed part is removed from the
    returned text.  Otherwise the anchor is the lower-cased initials of the
    space-separated words of *t* that start with an ASCII letter, and the
    text is returned unchanged.

    Empty tokens (from an empty string or repeated/leading/trailing spaces)
    are skipped instead of raising ``IndexError``.
    """
    match = re.search(r'\[(.*)\]', t)
    if match and bracket:
        anchor = match.group(1).lower()
        t = re.sub(r'\[.*\]', '', t)
    else:
        # word[:1] is '' for an empty token, which simply fails the letter test.
        anchor = ''.join(
            word[0].lower() for word in t.split(' ') if re.search(r'[a-zA-Z]', word[:1])
        )
    return ('question', t, anchor)

def handle_answer(t):
    """Tag *t* as an answer: returns the pair ``('answer', t)``."""
    kind = 'answer'
    return (kind, t)

def handle_sec(t):
    """Tag *t* as a section heading: returns ``('section', t)``."""
    kind = 'section'
    return (kind, t)
def handle_para(t):
    """Tag *t* as a paragraph: returns ``('paragraph', t)``."""
    kind = 'paragraph'
    return (kind, t)
def handle_ul(t):
    """Tag *t* as an unordered list: returns ``('unorderdedlist', t)``.

    NOTE(review): the tag string is misspelled; it is kept exactly as-is
    because code elsewhere may match on it - confirm before correcting.
    """
    kind = 'unorderdedlist'
    return (kind, t)
def handle_li(t):
    """Tag *t* as a list item: returns ``('listitem', t)``."""
    kind = 'listitem'
    return (kind, t)


# Module-level image bookkeeping: a running counter plus three dicts
# (lookup, heights, widths).
# get_doc() below keeps its own local image bookkeeping and does not read or
# update these module-level values.
# NOTE(review): the only references visible nearby are inside commented-out
# code - confirm nothing else in the project uses these before removing them.
img_count = 1
img_lookup = {}
img_heights = {}
img_widths = {}


'''def fetch_doc_image(k,value):
    global img_count, img_lookup, img_heights, img_widths
    if 'inlineObjectProperties' in value:
        if 'embeddedObject' in value['inlineObjectProperties']:
            if 'imageProperties' in value['inlineObjectProperties']['embeddedObject']:
                if 'contentUri' in value['inlineObjectProperties']['embeddedObject']['imageProperties']:
                    print(k)
                    uu = value['inlineObjectProperties']['embeddedObject']['imageProperties']['contentUri']
                    response = requests.get(uu, stream=True)
                    name = 'image_' + str(img_count) + '.' + response.headers['content-type'].split('/')[1]
                    img_count += 1
                    img_lookup[k] = name

                    with open('cache/doc_images/'+name, 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    print(uu)
                    print(response.headers)
                    print(name)
                    del response
            if 'size' in  value['inlineObjectProperties']['embeddedObject']:
                img_heights[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['height']['magnitude'])
                img_widths[k] = int(value['inlineObjectProperties']['embeddedObject']['size']['width']['magnitude'])
'''


def fetch_doc_image(k,value):
    """Download the image of one inline object into ``cache/doc_images/``.

    Parameters:
        k: identifier used in the saved file name (``image_<k>.<ext>``).
        value: inline-object dict; the image URL is read from
            ``value['inlineObjectProperties']['embeddedObject']
            ['imageProperties']['contentUri']``.

    Returns:
        Always ``True``, whether or not an image was found and saved.

    The file extension is the subtype of the response's ``content-type``
    header with any ``; parameter`` suffix removed.  Nothing is downloaded
    when the dict has no content URI.  HTTP error statuses are not checked,
    so an error response body is saved like an image.
    """
    import shutil

    embedded = value.get('inlineObjectProperties', {}).get('embeddedObject', {})
    uu = embedded.get('imageProperties', {}).get('contentUri')
    if not uu:
        return True

    # The context manager closes the streamed connection even if writing fails.
    with requests.get(uu, stream=True) as response:
        # "image/png; charset=binary" -> "png"
        subtype = response.headers['content-type'].split('/')[1].split(';')[0].strip()
        name = 'image_' + str(k) + '.' + subtype
        with open('cache/doc_images/'+name, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    return True

def _gdoc_credentials(scopes):
    """Load, refresh or interactively obtain Google API credentials for *scopes*."""
    import os.path
    import pickle
    from google_auth_oauthlib.flow import InstalledAppFlow
    from google.auth.transport.requests import Request

    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    # NOTE: unpickling runs arbitrary code from the file; token.pickle must
    # only ever be a file written by this program.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', scopes)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    return creds


def _gdoc_fetch_images(doc_objects, tempout):
    """Download every inline image; return ``(lookup, heights, widths)`` dicts keyed by object id."""
    import shutil

    img_lookup = {}
    img_heights = {}
    img_widths = {}
    img_count = 1

    for k, value in (doc_objects or {}).items():
        tempout.write("->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
        embedded = value.get('inlineObjectProperties', {}).get('embeddedObject')
        if embedded is None:
            continue

        uu = embedded.get('imageProperties', {}).get('contentUri')
        if uu:
            print(k)
            # The context manager closes the streamed connection even on error.
            with requests.get(uu, stream=True) as response:
                # "image/png; charset=binary" -> "png"
                subtype = response.headers['content-type'].split('/')[1].split(';')[0].strip()
                name = 'image_' + str(img_count) + '.' + subtype
                img_count += 1
                img_lookup[k] = name
                with open('cache/doc_images/'+name, 'wb') as out_file:
                    shutil.copyfileobj(response.raw, out_file)
                print(uu)
                print(response.headers)
                print(name)

        if 'size' in embedded:
            img_heights[k] = int(embedded['size']['height']['magnitude'])
            img_widths[k] = int(embedded['size']['width']['magnitude'])

    return img_lookup, img_heights, img_widths


def _gdoc_update_list_stack(list_stack, paragraph):
    """Update *list_stack* (open list ids, outermost first) in place for *paragraph*."""
    bullet = paragraph.get('bullet')
    if bullet is None:
        # Current paragraph isn't a bullet: every open list ends here.
        list_stack.clear()
        return

    lid = bullet['listId']
    if lid in list_stack:
        # Still in, or returning to, an already-open list: close only the
        # lists nested above it.
        while list_stack[-1] != lid:
            list_stack.pop()
    else:
        # Starting a new list, or a list nested in the current one.
        list_stack.append(lid)


def get_doc(docid, bracket=1, verbose=0):
    """Fetch a Google Doc and convert it to accordion-style FAQ HTML.

    Parameters:
        docid: id of the Google document to fetch.
        bracket: passed to ``question``; when truthy a ``[anchor]`` in a
            HEADING_3 paragraph supplies that question's anchor name.
        verbose: when truthy, print the raw document and each element.

    Returns:
        An HTML string.  HEADING_2 paragraphs become section headings
        (``sec``), HEADING_3 paragraphs open an accordion entry
        (``question``), and everything else accumulates as the answer body
        (``para`` / ``li``, with nested ``<ul>`` for bullets) until the style
        changes away from NORMAL_TEXT.  Tables are replaced by the literal
        marker ``TABLE``.  The opening ``acrd_grp`` div is not closed here.
        NOTE(review): presumably the caller closes it - confirm.

    Side effects:
        Reads/writes ``token.pickle`` (may run an interactive OAuth flow),
        writes a debug dump to ``cache/trash/gdoctemp.txt``, saves inline
        images under ``cache/doc_images/`` and prints progress to stdout.

    A paragraph without a named style inherits the style of the previous
    paragraph; the first one defaults to NORMAL_TEXT.
    """
    from googleapiclient.discovery import build

    # If modifying these scopes, delete the file token.pickle.
    SCOPES = ['https://www.googleapis.com/auth/documents.readonly']
    creds = _gdoc_credentials(SCOPES)
    service = build('docs', 'v1', credentials=creds)

    # Retrieve the documents contents from the Docs service.
    document = service.documents().get(documentId=docid).execute()
    if verbose: print(document)

    text = '<div class="acrd_grp" data-accordion-group="">'
    last_type = ''
    answer_text = ''
    style_type = 'NORMAL_TEXT'
    list_stack = []
    last_list_depth = 0

    with codecs.open('cache/trash/gdoctemp.txt','w','utf-8') as tempout:
        tempout.write( json.dumps(document,indent=2) + "\n\n\n------------------------------------\n\n")
        if verbose: print('The title of the document is: {}'.format(document.get('title')))
        doc_content = document.get('body').get('content')
        if verbose: print(doc_content)

        doc_objects = document.get('inlineObjects')
        if verbose: print(doc_objects)

        img_lookup, img_heights, img_widths = _gdoc_fetch_images(doc_objects, tempout)

        tempout.write('- - - - - - - -\n\n')
        tempout.write('- - - - - - - -\n\n')

        for value in doc_content:
            tempout.write( json.dumps(value,indent=2) + "\n\n\n")
            if verbose: print(json.dumps(value, sort_keys=True, indent=4))

            if 'paragraph' not in value:
                if 'table' in value:
                    # Table cells hold nested structural elements; they are
                    # not converted, only marked.
                    text += "\nTABLE\n"
                continue

            paragraph = value['paragraph']

            # Open or close <ul> elements to match the change in list depth.
            _gdoc_update_list_stack(list_stack, paragraph)
            list_depth = len(list_stack)
            deeper = list_depth - last_list_depth
            if deeper > 0:
                answer_text += "<ul>" * deeper
            elif deeper < 0:
                answer_text += "</ul>" * -deeper
            tag_fxn = li if list_stack else para

            named_style = paragraph.get('paragraphStyle', {}).get('namedStyleType')
            if named_style is not None:
                style_type = named_style

            this_text = ''
            for elem in paragraph.get('elements') or []:
                # text content
                this_text += read_paragraph_element(elem, style_type)

                # image content
                ii = elem.get('inlineObjectElement', {}).get('inlineObjectId')
                if ii in img_lookup:
                    img = img_lookup[ii]
                    if ii in img_widths and ii in img_heights:
                        this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img, img_widths[ii], img_heights[ii])
                    else:
                        # No size recorded for this image: omit the dimensions.
                        this_text += '<img src="doc_images/%s" />' % img

            # Leaving a run of normal text ends the current answer.
            if last_type=='NORMAL_TEXT' and style_type!=last_type:
                text += answer(answer_text)
                answer_text = ''

            if style_type=='HEADING_2':
                text += sec(this_text)
            elif style_type=='HEADING_3':
                text += question(this_text,bracket)
            else:
                answer_text += tag_fxn(this_text)

            last_type = style_type
            last_list_depth = list_depth

    # Close any list still open when the document ends.
    if last_list_depth > 0:
        answer_text += "</ul>" * last_list_depth

    text += answer(answer_text)
    return text


'''#text =
    result = []
    last_type = ''
    #answer_text = ''
    answer = []
    in_a_list = ''

    # Get all the images
    for k,value in doc_objects.items():
        tempout.write( "->" + k + "=" + json.dumps(value,indent=2) + "\n\n\n--\n\n")
        fetched = fetch_doc_image(k,value)

    list_stack = []
    list_depth = 0
    last_list_depth = 0
    for value in doc_content:
        tempout.write( json.dumps(value,indent=2) + "\n\n\n")
        if verbose: print(json.dumps(value, sort_keys=True, indent=4))

        tag_fxn = handle_para
        if 'paragraph' in value:
            this_text = ''

            # First we deal with if we're in a list.
            if 'bullet' in value['paragraph']:
                # either we're (1)starting a new list, (2)in one (do nothing),
                #  (3)starting a nested one, or (4)finished a nested one.
                lid = value['paragraph']['bullet']['listId']
                if not list_stack:  # 1
                    list_stack.append(lid)
                else:
                    if not lid == list_stack[0]:
                        if not lid in list_stack:   # 3
                            list_stack.append(lid)
                        else:                       # 4
                            x = list_stack.pop()
                            while x != lid: list_stack.pop()
            elif len(list_stack) > 0:
                #  current para isn't a bullet but we still have a list open.
                list_stack = []


            list_depth = len(list_stack)
            deeper = list_depth - last_list_depth
            if deeper > 0:
                answer.append("<ul>" * deeper)
            elif deeper < 0:
                deeper = -1 * deeper
                answer.append("</ul>" * deeper)
            if len(list_stack):
                tag_fxn = handle_li

            # NOW the tag_fxn is either 'para' or 'li'... let's get the styling info next,
            elements = value.get('paragraph').get('elements')
            if 'paragraphStyle' in value.get('paragraph'):
                style = value.get('paragraph').get('paragraphStyle')
                if 'namedStyleType' in style:
                    type = style['namedStyleType']

            # and FINALLY, the actual contents.
            for elem in elements:
                # text content
                this_text += read_paragraph_element_2(elem,type)

                # image content
                if 'inlineObjectElement' in elem:
                    vpi = elem['inlineObjectElement']
                    if 'inlineObjectId' in vpi:
                        ii = vpi['inlineObjectId']
                        if ii in img_lookup:
                            img = img_lookup[ii]
                            h = img_heights[ii]
                            w = img_widths[ii]
                            this_text += '<img src="doc_images/%s" width="%i" height="%i" />' % (img,w,h)


            # Now for something tricky. Call an appropriate handler, based on:
            #  (a) what is the paragraph style type?
            #  (b) is it different from the prev one?

            if last_type=='NORMAL_TEXT' and type!=last_type:
                if this_text.strip():
                    result.append(handle_answer(answer))
                answer = []
                #answer_text = ''

            if type=='HEADING_2' and this_text.strip():
                result.append( handle_sec(this_text) )
                this_text = ''
            elif type=='HEADING_3' and this_text.strip():
                result.append(handle_question(this_text,bracket))
                this_text = ''
            else:
                if this_text.lower().startswith('tags:'):
                    tag_fxn = handle_tags
                if this_text.lower().startswith('icons:'):
                    tag_fxn = handle_icons
                if this_text.strip():
                    answer.append(tag_fxn(this_text))
                this_text = ''
            last_type = type
            last_list_depth = list_depth

        elif 'table' in value:
            pass


    result.append(handle_answer(answer))
    return json.dumps(result,indent=4)

'''

def get_doc_generic(docid, bracket=1, verbose=0):
    """Alias of ``get_doc``: forwards all three arguments unchanged."""
    converted = get_doc(docid, bracket, verbose)
    return converted


if __name__ == "__main__":
    # Command-line menu.  The option number may be given as the first
    # argument; otherwise the menu is printed and the user is prompted.

    print ('')
    options = { 1: ['download a class into a folder / word file', course_download] ,
                2: ['download multiple classes', multiple_downloads ],
                3: ['convert stuff', pan_testing ],
                4: ['convert md to html', md_to_course ],
                5: ['course download tester', test_forums ],
                6: ['download all a courses pages', grab_course_pages],
                7: ['quick site downloader', download_web],
                8: ['upload modified pages back to Canvas', upload_modified_pages_prompt],
               17: ['repair ezproxy links', repair_ezproxy_links],
               18: ['create pages from html files', make_pages_from_folder],
               19: ['fetch support page', fetch_support_page],
               20: ['create support page', create_support_page],
               21: ['add support page to all shells in semester', add_support_page_full_semester],
               22: ['fetch all modules / items', check_modules_for_old_orientation],
               30: ['media fetch', media_testing],
               40: ['flow grid', flowgrid],
              }

    # Only a purely numeric argument counts as a menu choice.
    from_argv = len(sys.argv) > 1 and re.fullmatch(r'\d+', sys.argv[1]) is not None
    if from_argv:
        resp = sys.argv[1]
    else:
        print ('')
        for key in options:
            print(str(key) + '.\t' + options[key][0])

        print('')
        resp = input('Choose: ')

    # Reject anything that is not one of the listed option numbers instead of
    # failing with a bare KeyError / ValueError traceback.
    try:
        label, action = options[int(resp)]
    except (ValueError, KeyError):
        sys.exit("Unknown option: %r" % (resp,))

    if from_argv:
        print("\n\nPerforming: %s\n\n" % label)

    # Call the function in the options dict
    action()