import requests, codecs, os, re, json
|
|
from pipelines import header, fetch, url, put_file
|
|
from util import clean_title, to_file_friendly, minimal_string
|
|
from bs4 import BeautifulSoup as bs
|
|
from html import unescape
|
|
import tomd, checker
|
|
import html2markdown as h2m
|
|
import pypandoc
|
|
import webbrowser
|
|
# Cache of YouTube titles fetched by yt_title(); persisted between runs.
try:
    saved_titles = json.loads(codecs.open('cache/saved_youtube_titles.json', 'r', 'utf-8').read())
except (IOError, ValueError):
    saved_titles = {}

DBG = 1
|
|
|
|
def d(s):
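    """Debug helper: print s only when the module-level DBG flag is truthy."""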
|
|
global DBG
|
|
if DBG: print(s)
|
|
|
|
def stripper(s):
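    """Strip presentational attributes (style, color, width, ...) from an HTML
    string with BeautifulSoup so it converts to Markdown more cleanly."""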
|
|
REMOVE_ATTRIBUTES = [
|
|
'lang','language','onmouseover','onmouseout','script','style','font',
|
|
        'dir','face','size','color','class','width','height','hspace',
|
|
'border','valign','align','background','bgcolor','text','link','vlink',
|
|
'alink','cellpadding','cellspacing']
|
|
|
|
#doc = '''<html><head><title>Page title</title></head><body><p id="firstpara" align="center">This is <i>paragraph</i> <a onmouseout="">one</a>.<p id="secondpara" align="blah">This is <i>paragraph</i> <b>two</b>.</html>'''
|
|
soup = bs(s, features='lxml')
|
|
for tag in soup.recursiveChildGenerator():
|
|
try:
|
|
            tag.attrs = {key: value for key, value in tag.attrs.items()
                         if key not in REMOVE_ATTRIBUTES}
|
|
except AttributeError:
|
|
# 'NavigableString' object has no attribute 'attrs'
|
|
pass
|
|
return soup.prettify()
|
|
|
|
def mycleaner(s):
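    """Tidy converted markup: turn <br/> into newlines, drop <b> tags, and
    collapse repeated spaces and whitespace-only lines."""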
|
|
s = re.sub(r'<br\s?\/>','\n',s)
|
|
s = re.sub(r'<\/?b>','',s)
|
|
s = re.sub(r' +',' ',s)
|
|
s = re.sub(r'^[\s\t\r\n]+$','',s,flags=re.MULTILINE)
|
|
s = re.sub('^ ','',s)
|
|
return s
|
|
|
|
def freshdesk():
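    """Read an exported Freshdesk Solutions.xml, convert each solution-article
    (title plus the desc-un-html field) to Markdown-ish text, and write the
    combined result to cache/faqs.txt."""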
|
|
path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
|
|
soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
|
|
|
|
outpt = codecs.open('cache/faqs.txt','w')
|
|
out = ""
|
|
for a in soup.find_all('solution-article'):
|
|
|
|
print("TITLE\n"+a.find('title').get_text())
|
|
out += a.find('title').get_text()
|
|
|
|
"""for d in a.find_all('description'):
|
|
#print(d)
|
|
if d:
|
|
d = h.unescape(d.get_text())
|
|
e = stripper(d)
|
|
m = tomd.convert( e )
|
|
m = mycleaner(m)
|
|
print("\nDESCRIPTION\n"+m)"""
|
|
|
|
#print("\nWHAT IS THIS?\n" +
|
|
hh = a.find('desc-un-html').get_text()
|
|
        d = unescape(hh)
|
|
e = stripper(d)
|
|
m = tomd.convert( e )
|
|
m = mycleaner(m)
|
|
print("\nDESCRIPTION\n"+m)
|
|
out += "\n\n" + m + "\n\n"
|
|
|
|
print("-----------\n\n")
|
|
outpt.write(out)
|
|
|
|
# Download everything interesting in a course to a local folder
|
|
# Build a master file with the entire class content
|
|
def accessible_check(id=""):
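    """Download a course's files, pages and assignments into
    ../course_temps/course_<id>, write an index.html of everything gathered,
    and stitch the content together in module order into fullcourse.raw.html,
    which pandoc then converts to .html, .md and .docx."""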
|
|
if not id:
|
|
id = input("ID of course to check? ")
|
|
pagebreak = '\n\n<!-- BREAK -->\n\n'
|
|
verbose = 1
|
|
|
|
save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document']
|
|
|
|
courseinfo = fetch('/api/v1/courses/' + str(id), verbose )
|
|
|
|
item_id_to_index = {}
|
|
items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak,]
|
|
running_index = 1
|
|
|
|
modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose)
|
|
|
|
    # Pre-size the ordered-content list; module items fill in their slots below
    items = [0] * 9000
|
|
|
|
video_link_list = []
|
|
|
|
for m in modules:
|
|
items[running_index] = '<h2>%s</h2>%s\n' % ( m['name'], pagebreak )
|
|
running_index += 1
|
|
|
|
mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/'+str(m['id'])+'/items', verbose)
|
|
|
|
for I in mod_items:
|
|
|
|
if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl' ] or 'content_id' in I:
|
|
running_index += 1
|
|
|
|
if I['type'] == 'SubHeader':
|
|
#print('subheader: ' + str(I))
|
|
                    items[running_index] = '<h3>%s</h3>\n' % I['title']
|
|
|
|
if I['type'] == 'Page':
|
|
item_id_to_index[ I['page_url'] ] = running_index
|
|
|
|
if I['type'] == 'Quiz':
|
|
item_id_to_index[ I['content_id'] ] = running_index
|
|
|
|
if I['type'] == 'Discussion':
|
|
item_id_to_index[ I['content_id'] ] = running_index
|
|
|
|
if I['type'] == 'ExternalUrl':
|
|
items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
|
|
|
|
# ?
|
|
#if 'content_id' in I:
|
|
# item_id_to_index[ I['content_id'] ] = running_index
|
|
else:
|
|
print("What is this item? " + str(I))
|
|
|
|
|
|
#items_inorder.append('<i>Not included: '+ I['title'] + '(a ' + I['type'] + ')</i>\n\n\n' )
|
|
|
|
# I['title']
|
|
# I['content_id']
|
|
# I['page_url']
|
|
# I['type']
|
|
# I['published']
|
|
# assignments and files have content_id, pages have page_url
|
|
|
|
course_folder = '../course_temps/course_'+id
|
|
index = []
|
|
try:
|
|
os.mkdir(course_folder)
|
|
except:
|
|
print("Course folder exists.")
|
|
###
|
|
### FILES
|
|
###
|
|
files_f = course_folder + '/files'
|
|
headered = 0
|
|
print("\nFILES")
|
|
try:
|
|
os.mkdir(files_f)
|
|
except:
|
|
print(" * Files folder already exists.")
|
|
|
|
files = fetch('/api/v1/courses/' + str(id) + '/files', verbose)
|
|
print("LISTING COURSE FILES")
|
|
for f in files:
|
|
        # record the size in whole kilobytes
        f['size'] = str(int(f['size']) // 1000) + 'k'
|
|
|
|
if f['content-type'] in save_file_types:
|
|
d(' - %s' % f['filename'])
|
|
|
|
if not os.path.exists(files_f + '/' + f['filename']):
|
|
r = requests.get(f['url'],headers=header, stream=True)
|
|
with open(files_f + '/' + f['filename'], 'wb') as fd:
|
|
for chunk in r.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
else:
|
|
d(" - already downloaded %s" % files_f + '/' + f['filename'])
|
|
|
|
if not headered:
|
|
index.append( ('<br /><b>Files</b><br />') )
|
|
headered = 1
|
|
index.append( ('files/' + f['filename'], f['filename']) )
|
|
|
|
###
|
|
### PAGES
|
|
###
|
|
pages_f = course_folder + '/pages'
|
|
headered = 0
|
|
image_count = 0
|
|
print("\nPAGES")
|
|
try:
|
|
os.mkdir(pages_f)
|
|
except:
|
|
print(" * Pages folder already exists.")
|
|
|
|
|
|
pages = fetch('/api/v1/courses/' + str(id) + '/pages', verbose)
|
|
for p in pages:
|
|
d(' - %s' % p['title'])
|
|
|
|
p['title'] = clean_title(p['title'])
|
|
easier_filename = clean_title(p['url'])
|
|
this_page_filename = "%s/%s.html" % (pages_f, easier_filename)
|
|
#for a in 'title,updated_at,published'.split(','):
|
|
# print(str(p[a]), "\t", end=' ')
|
|
|
|
if not headered:
|
|
index.append( ('<br /><b>Pages</b><br />') )
|
|
headered = 1
|
|
index.append( ( 'pages/' + easier_filename + '.html', p['title'] ) )
|
|
|
|
|
|
if os.path.exists(this_page_filename):
|
|
d(" - already downloaded %s" % this_page_filename)
|
|
this_page_content = open(this_page_filename,'r').read()
|
|
elif re.search(r'eis-prod',p['url']) or re.search(r'gavilan\.ins',p['url']):
|
|
d(' * skipping file behind passwords')
|
|
else:
|
|
t2 = fetch('/api/v1/courses/' + str(id) + '/pages/'+p['url'], verbose)
|
|
if t2 and 'body' in t2 and t2['body']:
|
|
bb = bs(t2['body'],features="lxml")
|
|
a_links = bb.find_all('a')
|
|
for A in a_links:
|
|
if re.search( r'youtu', A['href']):
|
|
video_link_list.append( (A['href'], A.text, 'pages/'+easier_filename + ".html") )
|
|
|
|
|
|
page_images = bb.find_all('img')
|
|
for I in page_images:
|
|
d(' - %s' % I['src'])
|
|
if re.search(r'eis-prod',I['src']) or re.search(r'gavilan\.ins',I['src']):
|
|
d(' * skipping file behind passwords')
|
|
else:
|
|
try:
|
|
r = requests.get(I['src'],headers=header, stream=True)
|
|
mytype = r.headers['content-type']
|
|
#print("Response is type: " + str(mytype))
|
|
r_parts = mytype.split("/")
|
|
ending = r_parts[-1]
|
|
|
|
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
|
|
for chunk in r.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
image_count += 1
|
|
except Exception as e:
|
|
d( ' * Error downloading page image, %s' % str(e) )
|
|
|
|
try:
|
|
with codecs.open(this_page_filename, 'w','utf-8') as fd:
|
|
this_page_content = "<h2>%s</h2>\n%s" % ( t2['title'], t2['body'] )
|
|
fd.write(this_page_content)
|
|
except:
|
|
d(' * problem writing page content')
|
|
## TODO include linked pages even if they aren't in module
|
|
else:
|
|
d(' * nothing returned or bad fetch')
|
|
# write to running log of content in order of module
|
|
if p and p['url'] in item_id_to_index:
|
|
items[ item_id_to_index[ p['url'] ] ] = this_page_content +'\n\n'+pagebreak
|
|
else:
|
|
d(' -- This page didnt seem to be in the modules list.')
|
|
|
|
|
|
###
|
|
### ASSIGNMENTS
|
|
###
|
|
headered = 0
|
|
asm_f = course_folder + '/assignments'
|
|
print("\nASSIGNMENTS")
|
|
try:
|
|
os.mkdir(asm_f)
|
|
except:
|
|
d(" - Assignments dir exists")
|
|
|
|
asm = fetch('/api/v1/courses/' + str(id) + '/assignments', verbose)
|
|
for p in asm:
|
|
d(' - %s' % p['name'])
|
|
|
|
|
|
try:
|
|
friendlyfile = to_file_friendly(p['name'])
|
|
this_assmt_filename = asm_f + '/' + str(p['id'])+"_"+ friendlyfile + '.html'
|
|
if os.path.exists(this_assmt_filename):
|
|
d(" - already downloaded %s" % this_assmt_filename)
|
|
this_assmt_content = open(this_assmt_filename,'r').read()
|
|
else:
|
|
t2 = fetch('/api/v1/courses/' + str(id) + '/assignments/'+str(p['id']), verbose)
|
|
with codecs.open(this_assmt_filename, 'w','utf-8') as fd:
|
|
this_assmt_content = "<h2>%s</h2>\n%s\n\n" % (t2['name'], t2['description'])
|
|
fd.write(this_assmt_content)
|
|
if not headered:
|
|
index.append( ('<br /><b>Assignments</b><br />') )
|
|
headered = 1
|
|
index.append( ('assignments/' + str(p['id'])+"_"+friendlyfile + '.html', p['name']) )
|
|
|
|
# write to running log of content in order of module
|
|
            if p['id'] in item_id_to_index:
                items[ item_id_to_index[ p['id'] ] ] = this_assmt_content + '\n\n' + pagebreak
|
|
except Exception as e:
|
|
d(' * Problem %s' % str(e))
|
|
|
|
###
|
|
### FORUMS
|
|
###
|
|
"""forum_f = course_folder + '/forums'
|
|
headered = 0
|
|
image_count = 0
|
|
print("\nFORUMS")
|
|
try:
|
|
os.mkdir(forum_f)
|
|
forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
|
|
for p in forums:
|
|
p['title'] = clean_title(p['title'])
|
|
forum_id = p['id']
|
|
easier_filename = p['title']
|
|
for a in 'title,posted_at,published'.split(','):
|
|
print(str(p[a]), "\t", end=' ')
|
|
print("")
|
|
t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
|
|
|
|
|
|
#### REMOVED
|
|
bb = bs(t2['body'],features="lxml")
|
|
print("IMAGES IN THIS PAGE")
|
|
page_images = bb.find_all('img')
|
|
for I in page_images:
|
|
r = requests.get(I['src'],headers=header, stream=True)
|
|
mytype = r.headers['content-type']
|
|
print("Response is type: " + str(mytype))
|
|
r_parts = mytype.split("/")
|
|
ending = r_parts[-1]
|
|
|
|
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
|
|
for chunk in r.iter_content(chunk_size=128):
|
|
fd.write(chunk)
|
|
image_count += 1
|
|
#### END REMOVED
|
|
|
|
try:
|
|
with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
|
|
fd.write("<h1>"+t2['title']+"</h1>\n")
|
|
fd.write(t2['message'])
|
|
if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
|
|
headered = 1
|
|
index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
|
|
|
|
# write to running log of content in order of module
|
|
if p['id'] in item_id_to_index:
|
|
items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
|
|
else:
|
|
print(' This forum didnt seem to be in the modules list.')
|
|
except Exception as e:
|
|
print("Error here:", e)
|
|
#print p
|
|
#print results_dict
|
|
except Exception as e:
|
|
print("** Forum folder seems to exist. Skipping those.")
|
|
print(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
###
|
|
### QUIZZES
|
|
###
|
|
|
|
|
|
# get a list external urls
|
|
headered = 0
|
|
t = url + '/api/v1/courses/' + str(id) + '/modules'
|
|
while t: t = fetch(t)
|
|
mods = results
|
|
results = []
|
|
for m in mods:
|
|
results = []
|
|
t2 = url + '/api/v1/courses/' + str(id) + '/modules/' + str(m['id']) + '/items'
|
|
while t2: t2 = fetch(t2)
|
|
items = results
|
|
for i in items:
|
|
#print i
|
|
if i['type'] == "ExternalUrl":
|
|
#print i
|
|
for j in 'id,title,external_url'.split(','):
|
|
print unicode(i[j]), "\t",
|
|
print ""
|
|
if not headered: index.append( ('<br /><b>External Links</b><br />') )
|
|
headered = 1
|
|
index.append( (i['external_url'], i['title']) )
|
|
"""
|
|
|
|
|
|
|
|
# Create index page of all gathered items
|
|
myindex = codecs.open(course_folder+'/index.html','w','utf-8')
|
|
for i in index:
|
|
if len(i)==2: myindex.write("<a href='"+i[0]+"'>"+i[1]+"</a><br />\n")
|
|
else: myindex.write(i)
|
|
|
|
|
|
|
|
# Full course content in single file
|
|
print("Writing main course files...")
|
|
mycourse = codecs.open(course_folder+'/fullcourse.raw.html','w','utf-8')
|
|
|
|
for I in items:
|
|
if I:
|
|
mycourse.write( I )
|
|
|
|
|
|
|
|
temp = open('cache/coursedump.txt','w')
|
|
temp.write( "items: " + json.dumps(items,indent=2) )
|
|
temp.write("\n\n\n")
|
|
temp.write( "index: " + json.dumps(index,indent=2) )
|
|
temp.write("\n\n\n")
|
|
temp.write( "items_inorder: " + json.dumps(items_inorder,indent=2) )
|
|
temp.write("\n\n\n")
|
|
temp.write( "item_id_to_index: " + json.dumps(item_id_to_index,indent=2) )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if video_link_list:
|
|
mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
|
|
for V in video_link_list:
|
|
            (v_url, txt, pg) = V
            mycourse.write("<tr><td><a target='_blank' href='"+v_url+"'>"+txt+"</a></td><td> on <a target='_blank' href='" + pg + "'>" + pg + "</a></td></tr>\n")
|
|
mycourse.write("</table>\n")
|
|
|
|
mycourse.close()
|
|
output = pypandoc.convert_file(course_folder+'/fullcourse.raw.html', 'html', outputfile=course_folder+"/fullcourse.html")
|
|
output1 = pypandoc.convert_file(course_folder+'/fullcourse.html', 'md', outputfile=course_folder+"/fullcourse.md")
|
|
output2 = pypandoc.convert_file(course_folder+'/fullcourse.html', 'docx', outputfile=course_folder+"/fullcourse.docx")
|
|
|
|
|
|
def pan_testing():
|
|
course_folder = '../course_temps/course_6862'
|
|
output3 = pypandoc.convert_file(course_folder+'/fullcourse.md', 'html', outputfile=course_folder+"/fullcourse.v2.html")
|
|
|
|
# Given course, page url, and new content, upload the new revision of a page
|
|
def create_page(course_num,new_title,new_content):
|
|
t3 = url + '/api/v1/courses/' + str(course_num) + '/pages'
|
|
#xyz = raw_input('Enter 1 to continue and send back to: ' + t3 + ': ')
|
|
#print("Creating page: %s\nwith content:%s\n\n\n" % (new_title,new_content))
|
|
print("Creating page: %s" % new_title)
|
|
xyz = input('type 1 to confirm: ') #'1'
|
|
if xyz=='1':
|
|
data = {'wiki_page[title]':new_title, 'wiki_page[body]':new_content}
|
|
r3 = requests.post(t3, headers=header, params=data)
|
|
print(r3)
|
|
print('ok')
|
|
|
|
|
|
def md_to_course():
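    """Convert a Markdown dump of a course to HTML with pandoc, split the
    result on <h1> headings, and push each section back to Canvas with
    upload_page(), using the heading text as the page title/url."""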
|
|
#input = 'C:/Users/peter/Nextcloud/Documents/gavilan/student_orientation.txt'
|
|
#output = 'C:/Users/peter/Nextcloud/Documents/gavilan/stu_orientation/student_orientation.html'
|
|
id = "11214"
|
|
infile = 'cache/pages/course_%s.md' % id
|
|
output = 'cache/pages/course_%s_fixed.html' % id
|
|
output3 = pypandoc.convert_file(infile, 'html', format='md', outputfile=output)
|
|
|
|
xx = codecs.open(output,'r','utf-8').read()
|
|
soup = bs( xx, features="lxml" )
|
|
soup.encode("utf-8")
|
|
|
|
current_page = ""
|
|
current_title = ""
|
|
|
|
for child in soup.body.children:
|
|
if child.name == "h1" and not current_title:
|
|
current_title = child.get_text()
|
|
elif child.name == "h1":
|
|
upload_page(id,current_title,current_page)
|
|
current_title = child.get_text()
|
|
current_page = ""
|
|
print( "Next page: %s" % current_title )
|
|
else:
|
|
#print(dir(child))
|
|
if 'prettify' in dir(child):
|
|
current_page += child.prettify(formatter="html")
|
|
else:
|
|
current_page += child.string
|
|
|
|
upload_page(id,current_title,current_page)
|
|
print("Done")
|
|
|
|
|
|
# DL pages only
|
|
def grab_course_pages(course_num=-1):
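    """Walk a course's modules and download every Page item in order, writing
    a combined HTML file and a Markdown file under cache/pages/."""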
|
|
global results, results_dict, url, header
|
|
# course_num = raw_input("What is the course id? ")
|
|
    # callers pass either an int (default -1 means "ask") or a string id
    if course_num == -1:
        course_num = input("Id of course? ")
    else:
        course_num = str(course_num)
|
|
modpagelist = []
|
|
modurllist = []
|
|
# We want things in the order of the modules
|
|
t4 = url + '/api/v1/courses/'+str(course_num)+'/modules?include[]=items'
|
|
results = fetch(t4)
|
|
i = 1
|
|
pageout = codecs.open('cache/pages/course_'+str(course_num)+'.html','w','utf-8')
|
|
pageoutm = codecs.open('cache/pages/course_'+str(course_num)+'.md','w','utf-8')
|
|
divider = "\n### "
|
|
for M in results:
|
|
print("Module Name: " + M['name'])
|
|
for I in M['items']:
|
|
if I['type']=='Page':
|
|
modpagelist.append(I['title'])
|
|
modurllist.append(I['page_url'])
|
|
pageout.write(divider+I['title']+'### '+I['page_url']+'\n')
|
|
easier_filename = clean_title(I['page_url'])
|
|
print(" " + str(i) + ". " + I['title'])
|
|
t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/'+I['page_url']
|
|
print('Getting: ' + t2)
|
|
mypage = fetch(t2)
|
|
fixed = checker.safe_html(mypage['body'])
|
|
if fixed:
|
|
#markdown = h2m.convert(fixed)
|
|
#p_data = pandoc.read(mypage['body'])
|
|
markdown = pypandoc.convert_text("\n<h1>" + I['title'] + "</h1>\n" + mypage['body'], 'md', format='html')
|
|
pageout.write(fixed+'\n')
|
|
pageoutm.write(markdown+'\n')
|
|
pageout.flush()
|
|
i += 1
|
|
pageout.close()
|
|
pageoutm.close()
|
|
|
|
# Upload pages. Local copy has a particular format.
|
|
# Appears to not be used
|
|
def put_course_pages():
|
|
course_num = '6862'
|
|
filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
|
|
my_titles = []
|
|
my_urls = []
|
|
my_bodys = []
|
|
started = 0
|
|
current_body = ""
|
|
for L in filein.readlines():
|
|
ma = re.search('^###\s(.*)###\s(.*)$',L)
|
|
if ma:
|
|
my_titles.append(ma.group(1))
|
|
my_urls.append(ma.group(2))
|
|
if started:
|
|
my_bodys.append(current_body)
|
|
current_body = ""
|
|
started = 1
|
|
else:
|
|
current_body += "\n" + L
|
|
my_bodys.append(current_body)
|
|
|
|
i = 0
|
|
for U in my_urls:
|
|
# and now upload it....lol
|
|
upload_page(course_num,U,my_bodys[i])
|
|
i += 1
|
|
|
|
# Also not used
|
|
def put_revised_pages():
|
|
course_num = '6862'
|
|
course_folder = '../course_temps/course_6862'
|
|
filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
|
|
my_titles = []
|
|
my_urls = []
|
|
my_bodys = []
|
|
started = 0
|
|
current_body = ""
|
|
for L in filein.readlines():
|
|
ma = re.search('^<h1>(.*)</h1>.*$',L)
|
|
if ma:
|
|
my_titles.append(ma.group(1))
|
|
my_urls.append(ma.group(2))
|
|
if started:
|
|
my_bodys.append(current_body)
|
|
current_body = ""
|
|
started = 1
|
|
else:
|
|
current_body += "\n" + L
|
|
my_bodys.append(current_body)
|
|
|
|
i = 0
|
|
for U in my_urls:
|
|
# and now upload it....lol
|
|
upload_page(course_num,U,my_bodys[i])
|
|
i += 1
|
|
|
|
# Download, clean html, and reupload page
|
|
def update_page():
|
|
global results, results_dict, url, header
|
|
# course_num = raw_input("What is the course id? ")
|
|
course_num = '6862'
|
|
t = url + '/api/v1/courses/' + str(course_num) + '/pages'
|
|
while t: t = fetch(t)
|
|
pages = results
|
|
results = []
|
|
mypagelist = []
|
|
myurllist = []
|
|
modpagelist = []
|
|
modurllist = []
|
|
for p in pages:
|
|
p['title'] = clean_title(p['title'])
|
|
mypagelist.append(p['title'])
|
|
myurllist.append(p['url'])
|
|
easier_filename = clean_title(p['url'])
|
|
#for a in 'title,updated_at,published'.split(','):
|
|
# print unicode(p[a]), "\t",
|
|
#print ""
|
|
|
|
# We want things in the order of the modules
|
|
t4 = url + '/api/v1/courses/'+str(course_num)+'/modules?include[]=items'
|
|
while t4: t4 = fetch(t4)
|
|
mods = results
|
|
results = []
|
|
i = 1
|
|
print("\nWhat page do you want to repair?")
|
|
for M in mods:
|
|
print("Module Name: " + M['name'])
|
|
for I in M['items']:
|
|
if I['type']=='Page':
|
|
modpagelist.append(I['title'])
|
|
modurllist.append(I['page_url'])
|
|
print(" " + str(i) + ". " + I['title'])
|
|
i += 1
|
|
|
|
choice = input("\n> ")
|
|
choice = int(choice) - 1
|
|
chosen_url = modurllist[choice]
|
|
print('Fetching: ' + modpagelist[choice])
|
|
t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/'+chosen_url
|
|
print('From: ' + t2)
|
|
|
|
results_dict = {}
|
|
while(t2): t2 = fetch(t2)
|
|
mypage = results_dict
|
|
fixed_page = checker.safe_html(mypage['body'])
|
|
upload_page(course_num,chosen_url,fixed_page)
|
|
|
|
# Given course, page url, and new content, upload the new revision of a page
|
|
def upload_page(course_num,pageurl,new_content):
|
|
print("Repaired page:\n\n")
|
|
#print new_content
|
|
print(pageurl)
|
|
t3 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + pageurl
|
|
xyz = input('Enter 1 to continue and send back to: ' + t3 + ': ')
|
|
#xyz = '1'
|
|
if xyz=='1':
|
|
data = {'wiki_page[body]':new_content}
|
|
r3 = requests.put(t3, headers=header, params=data)
|
|
print(r3)
|
|
print('ok')
|
|
|
|
# Use template to build html page with homegrown subtitles
|
|
def build_srt_embed_php(data):
|
|
template = codecs.open('template_srt_and_video.txt','r','utf-8').readlines()
|
|
result = ''
|
|
for L in template:
|
|
L = re.sub('FRAMEID',data['frameid'],L)
|
|
L = re.sub('TITLE',data['title'],L)
|
|
L = re.sub('EMBEDLINK',data['embedlink'],L)
|
|
L = re.sub('SRTFOLDERFILE',data['srtfolderfile'],L)
|
|
result += L
|
|
return result
|
|
|
|
|
|
|
|
|
|
def yt_title(code):
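    """Return the title of a YouTube video, scraping the watch page on a cache
    miss and persisting results in saved_titles between runs."""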
|
|
global saved_titles
|
|
if code in saved_titles:
|
|
return saved_titles[code]
|
|
a = requests.get('https://www.youtube.com/watch?v=%s' % code)
|
|
bbb = bs(a.content,"lxml")
|
|
ccc = bbb.find('title').text
|
|
ccc = re.sub(r'\s\-\sYouTube','',ccc)
|
|
saved_titles[code] = ccc
|
|
    codecs.open('cache/saved_youtube_titles.json','w','utf-8').write(json.dumps(saved_titles))
|
|
return ccc
|
|
|
|
def swap_youtube_subtitles():
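    """Interactively match local .srt files (under video_srt/) to the YouTube
    videos embedded in a course's pages, build a PHP embed page per match from
    template_srt_and_video.txt, and hand the folder to put_file()."""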
|
|
# example here: http://siloor.github.io/youtube.external.subtitle/examples/srt/
|
|
|
|
# srt folder, look at all filenames
|
|
srtlist = os.listdir('video_srt')
|
|
i = 0
|
|
for V in srtlist:
|
|
print(str(i) + '. ' + V)
|
|
i += 1
|
|
choice = input("Which SRT folder? ")
|
|
choice = srtlist[int(choice)]
|
|
srt_folder = 'video_srt/'+choice
|
|
class_srt_folder = choice
|
|
srt_files = os.listdir(srt_folder)
|
|
srt_shorts = {}
|
|
print("\nThese are the subtitle files: " + str(srt_files))
|
|
for V in srt_files:
|
|
if V.endswith('srt'):
|
|
V1 = re.sub(r'(\.\w+$)','',V)
|
|
srt_shorts[V] = minimal_string(V1)
|
|
|
|
crs_id = input("What is the id of the course? ")
|
|
grab_course_pages(crs_id)
|
|
v1_pages = codecs.open('page_revisions/course_'+str(crs_id)+'.html','r','utf-8')
|
|
v1_content = v1_pages.read()
|
|
|
|
# a temporary page of all youtube links
|
|
tp = codecs.open('page_revisions/links_' + str(crs_id) + '.html', 'w','utf-8')
|
|
|
|
# course pages, get them all and look for youtube embeds
|
|
title_shorts = {}
|
|
title_embedlink = {}
|
|
title_list = []
|
|
print("I'm looking for iframes and youtube links.")
|
|
for L in v1_content.split('\n'):
|
|
if re.search('<a.*?href="https:\/\/youtu',L):
|
|
print("Possibly there's a linked video instead of embedded:" + L)
|
|
if re.search('iframe',L):
|
|
ma = re.compile('(\w+)=(".*?")')
|
|
#print "\n"
|
|
this_title = ''
|
|
for g in ma.findall(L):
|
|
print(g)
|
|
if g[0]=='title':
|
|
this_title = g[1].replace('"','')
|
|
if g[0]=='src':
|
|
this_src = g[1].replace('"','')
|
|
#print g
|
|
if not this_title:
|
|
tmp = re.search(r'embed\/(.*?)\?',this_src)
|
|
if not tmp: tmp = re.search(r'embed\/(.*?)$',this_src)
|
|
if tmp:
|
|
this_title = yt_title(tmp.groups()[0])
|
|
title_shorts[this_title] = minimal_string(this_title)
|
|
title_list.append(this_title)
|
|
title_embedlink[this_title] = this_src
|
|
            print("%s\n" % this_title.encode('ascii', 'ignore').decode('ascii'))
|
|
tp.write( "%s<br><a target='_blank' href='%s'>%s</a><br /><br />" % (this_title, this_src, this_src) )
|
|
# match them
|
|
# lowercase, non alpha or num chars become a single space, try to match
|
|
# if any srts remain unmatched, ask.
|
|
tp.close()
|
|
webbrowser.open_new_tab('file://C:/SCRIPTS/everything-json/page_revisions/links_'+str(crs_id)+'.html')
|
|
|
|
matches = {} # key is Title, value is srt file
|
|
for S,v in list(srt_shorts.items()):
|
|
found_match = 0
|
|
print(v, end=' ')
|
|
for T, Tv in list(title_shorts.items()):
|
|
if v == Tv:
|
|
print(' \tMatches: ' + T, end=' ')
|
|
found_match = 1
|
|
matches[T] = S
|
|
break
|
|
#print "\n"
|
|
|
|
print("\nThese are the srt files: ")
|
|
print(json.dumps(srt_shorts,indent=2))
|
|
print("\nThese are the titles: ")
|
|
print(json.dumps(title_shorts,indent=2))
|
|
print("\nThese are the matches: ")
|
|
print(json.dumps(matches,indent=2))
|
|
|
|
print(("There are %d SRT files and %d VIDEOS found. " % ( len(list(srt_shorts.keys())), len(list(title_shorts.keys())) ) ))
|
|
|
|
for S,v in list(srt_shorts.items()):
|
|
if not S in list(matches.values()):
|
|
print("\nDidn't find a match for: " + S)
|
|
i = 0
|
|
for T in title_list:
|
|
                if not T in list(matches.keys()): print(str(i+1) + ". " + T.encode('ascii', 'ignore').decode('ascii'))
|
|
i += 1
|
|
print("Here's the first few lines of the SRT:")
|
|
print(( re.sub(r'\s+',' ', '\n'.join(open(srt_folder+"/"+S,'r').readlines()[0:10]))+"\n\n"))
|
|
choice = input("Which one should I match it to? (zero for no match) ")
|
|
if int(choice)>0:
|
|
matches[ title_list[ int(choice)-1 ] ] = S
|
|
print("SRT clean name was: %s, and TITLE clean name was: %s" % (v,title_shorts[title_list[ int(choice)-1 ]] ))
|
|
print("ok, here are the matches:")
|
|
print(json.dumps(matches,indent=2))
|
|
|
|
# construct subsidiary pages, upload them
|
|
i = 0
|
|
for m,v in list(matches.items()):
|
|
# open template
|
|
# do replacement
|
|
i += 1
|
|
data = {'frameid':'videoframe'+str(i), 'title':m, 'embedlink':title_embedlink[m], 'srtfolderfile':v }
|
|
print(json.dumps(data,indent=2))
|
|
file_part = v.split('.')[0]
|
|
new_php = codecs.open(srt_folder + '/' + file_part + '.php','w','utf-8')
|
|
new_php.write(build_srt_embed_php(data))
|
|
new_php.close()
|
|
#srt_files = os.listdir(srt_folder)
|
|
put_file(class_srt_folder)
|
|
|
|
|
|
def test_swap():
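    """Quick check of the iframe/title extraction used by
    swap_youtube_subtitles(), run against a saved dump of course 6923."""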
|
|
crs_id = '6923'
|
|
# swap in embed code and re-upload canvas pages
|
|
v2_pages = codecs.open('page_revisions/course_'+str(crs_id)+'.html','r','utf-8')
|
|
v2_content = v2_pages.read()
|
|
ma = re.compile('(\w+)=(".*?")')
|
|
|
|
for L in v2_content.split('\n'):
|
|
find = re.findall('<iframe(.*?)>',L)
|
|
if find:
|
|
print("Found: ", find)
|
|
for each in find:
|
|
#print "\n"
|
|
this_title = ''
|
|
this_src = ''
|
|
for g in ma.findall(each):
|
|
#print g
|
|
if g[0]=='title':
|
|
this_title = g[1].replace('"','')
|
|
if g[0]=='src':
|
|
this_src = g[1].replace('"','')
|
|
#print g
|
|
if not this_title:
|
|
tmp = re.search(r'embed\/(.*?)\?',this_src)
|
|
if not tmp: tmp = re.search(r'embed\/(.*?)$',this_src)
|
|
if tmp:
|
|
this_title = yt_title(tmp.groups()[0])
|
|
                print("Found embed link: %s\n and title: %s\n" % (this_src, this_title.encode('ascii', 'ignore').decode('ascii')))
|
|
|
|
|
|
def multiple_downloads():
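    """Run accessible_check() once for each id in a space-separated list."""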
|
|
|
|
x = input("What IDs? Separate with one space: ")
|
|
for id in x.split(" "):
|
|
accessible_check(id)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
print ('')
|
|
options = { 1: ['download a class into a folder / word file', accessible_check] ,
|
|
2: ['download multiple classes', multiple_downloads ],
|
|
3: ['convert stuff', pan_testing ],
|
|
4: ['convert md to html', md_to_course ],
|
|
5: ['import freshdesk content', freshdesk ],
|
|
                6: ["download all of a course's pages", grab_course_pages],
|
|
}
|
|
|
|
for key in options:
|
|
print(str(key) + '.\t' + options[key][0])
|
|
|
|
print('')
|
|
resp = input('Choose: ')
|
|
|
|
# Call the function in the options dict
|
|
options[ int(resp)][1]()