# canvasapp/content.py

import requests, codecs, os, re, json
from pipelines import header, fetch, url, put_file
from util import clean_title, to_file_friendly, minimal_string, stripper, mycleaner
from bs4 import BeautifulSoup as bs
from html.parser import HTMLParser
import tomd, checker
import html2markdown as h2m
import pypandoc
import webbrowser

h = HTMLParser()

# Cache of YouTube titles already looked up (used by yt_title below); start empty if missing.
try:
    saved_titles = json.loads(codecs.open('cache/saved_youtube_titles.json', 'r', 'utf-8').read())
except (OSError, ValueError):
    saved_titles = {}
DBG = 1
def d(s):
    global DBG
    if DBG: print(s)
# Download everything interesting in a course to a local folder
# Build a master file with the entire class content
def accessible_check(id=""):
    if not id:
        id = input("ID of course to check? ")
    pagebreak = '\n\n<!-- BREAK -->\n\n'
    verbose = 1
    save_file_types = ['application/pdf', 'application/docx', 'image/jpg', 'image/png', 'image/gif',
                       'image/webp', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
    courseinfo = fetch('/api/v1/courses/' + str(id), verbose)
    item_id_to_index = {}
    items_inorder = ["<font size='24'>" + courseinfo['name'] + "</font>\n\n" + pagebreak, ]
    running_index = 1
    modules = fetch('/api/v1/courses/' + str(id) + '/modules', verbose)
    items = []
    for x in range(9000):
        items.append(0)
    video_link_list = []
    for m in modules:
        items[running_index] = '<h2>%s</h2>%s\n' % (m['name'], pagebreak)
        running_index += 1
        mod_items = fetch('/api/v1/courses/' + str(id) + '/modules/' + str(m['id']) + '/items', verbose)
        for I in mod_items:
            if I['type'] in ['SubHeader', 'Page', 'Quiz', 'Discussion', 'ExternalUrl'] or 'content_id' in I:
                running_index += 1
                if I['type'] == 'SubHeader':
                    #print('subheader: ' + str(I))
                    items[running_index] = '<h3>%s</h3>\n' % I['title']
                if I['type'] == 'Page':
                    item_id_to_index[I['page_url']] = running_index
                if I['type'] == 'Quiz':
                    item_id_to_index[I['content_id']] = running_index
                if I['type'] == 'Discussion':
                    item_id_to_index[I['content_id']] = running_index
                if I['type'] == 'ExternalUrl':
                    items[running_index] = "<a href='%s'>%s</a><br />\n\n" % (I['external_url'], I['title'])
                # ?
                #if 'content_id' in I:
                #    item_id_to_index[ I['content_id'] ] = running_index
            else:
                print("What is this item? " + str(I))
                #items_inorder.append('<i>Not included: '+ I['title'] + '(a ' + I['type'] + ')</i>\n\n\n' )
    # Module item fields of interest:
    #   I['title'], I['content_id'], I['page_url'], I['type'], I['published']
    # assignments and files have content_id, pages have page_url
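    # For illustration (values are made up, abridged): a Page item looks roughly like
    #   {"id": 101, "title": "Welcome", "type": "Page", "page_url": "welcome", "published": true}
    # while Assignment/Quiz/Discussion items carry a content_id instead of a page_url:
    #   {"id": 102, "title": "Essay 1", "type": "Assignment", "content_id": 4567, "published": true}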
    course_folder = '../course_temps/course_' + id
    index = []
    try:
        os.mkdir(course_folder)
    except:
        print("Course folder exists.")

    ###
    ### FILES
    ###
    files_f = course_folder + '/files'
    headered = 0
    print("\nFILES")
    try:
        os.mkdir(files_f)
    except:
        print(" * Files folder already exists.")
    files = fetch('/api/v1/courses/' + str(id) + '/files', verbose)
    print("LISTING COURSE FILES")
    for f in files:
        for arg in 'filename,content-type,size,url'.split(','):
            if arg == 'size':
                f['size'] = str(int(f['size']) / 1000) + 'k'
        if f['content-type'] in save_file_types:
            d(' - %s' % f['filename'])
            if not os.path.exists(files_f + '/' + f['filename']):
                r = requests.get(f['url'], headers=header, stream=True)
                with open(files_f + '/' + f['filename'], 'wb') as fd:
                    for chunk in r.iter_content(chunk_size=128):
                        fd.write(chunk)
            else:
                d(" - already downloaded %s" % files_f + '/' + f['filename'])
            if not headered:
                index.append(('<br /><b>Files</b><br />'))
                headered = 1
            index.append(('files/' + f['filename'], f['filename']))
    ###
    ### PAGES
    ###
    pages_f = course_folder + '/pages'
    headered = 0
    image_count = 0
    print("\nPAGES")
    try:
        os.mkdir(pages_f)
    except:
        print(" * Pages folder already exists.")
    pages = fetch('/api/v1/courses/' + str(id) + '/pages', verbose)
    for p in pages:
        d(' - %s' % p['title'])
        p['title'] = clean_title(p['title'])
        easier_filename = clean_title(p['url'])
        this_page_filename = "%s/%s.html" % (pages_f, easier_filename)
        this_page_content = ''   # reset so the module log below never reuses a stale page
        #for a in 'title,updated_at,published'.split(','):
        #    print(str(p[a]), "\t", end=' ')
        if not headered:
            index.append(('<br /><b>Pages</b><br />'))
            headered = 1
        index.append(('pages/' + easier_filename + '.html', p['title']))
        if os.path.exists(this_page_filename):
            d(" - already downloaded %s" % this_page_filename)
            this_page_content = open(this_page_filename, 'r').read()
        elif re.search(r'eis-prod', p['url']) or re.search(r'gavilan\.ins', p['url']):
            d(' * skipping file behind passwords')
        else:
            t2 = fetch('/api/v1/courses/' + str(id) + '/pages/' + p['url'], verbose)
            if t2 and 'body' in t2 and t2['body']:
                bb = bs(t2['body'], features="lxml")
                a_links = bb.find_all('a')
                for A in a_links:
                    if re.search(r'youtu', A.get('href', '')):
                        video_link_list.append((A.get('href', ''), A.text, 'pages/' + easier_filename + ".html"))
                page_images = bb.find_all('img')
                for I in page_images:
                    d(' - %s' % I['src'])
                    if re.search(r'eis-prod', I['src']) or re.search(r'gavilan\.ins', I['src']):
                        d(' * skipping file behind passwords')
                    else:
                        try:
                            r = requests.get(I['src'], headers=header, stream=True)
                            mytype = r.headers['content-type']
                            #print("Response is type: " + str(mytype))
                            r_parts = mytype.split("/")
                            ending = r_parts[-1]
                            with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
                                for chunk in r.iter_content(chunk_size=128):
                                    fd.write(chunk)
                            image_count += 1
                        except Exception as e:
                            d(' * Error downloading page image, %s' % str(e))
                try:
                    with codecs.open(this_page_filename, 'w', 'utf-8') as fd:
                        this_page_content = "<h2>%s</h2>\n%s" % (t2['title'], t2['body'])
                        fd.write(this_page_content)
                except:
                    d(' * problem writing page content')
                ## TODO include linked pages even if they aren't in module
            else:
                d(' * nothing returned or bad fetch')
        # write to running log of content in order of module
        if this_page_content and p['url'] in item_id_to_index:
            items[item_id_to_index[p['url']]] = this_page_content + '\n\n' + pagebreak
        else:
            d(' -- This page didnt seem to be in the modules list.')
    ###
    ### ASSIGNMENTS
    ###
    headered = 0
    asm_f = course_folder + '/assignments'
    print("\nASSIGNMENTS")
    try:
        os.mkdir(asm_f)
    except:
        d(" - Assignments dir exists")
    asm = fetch('/api/v1/courses/' + str(id) + '/assignments', verbose)
    for p in asm:
        d(' - %s' % p['name'])
        try:
            friendlyfile = to_file_friendly(p['name'])
            this_assmt_filename = asm_f + '/' + str(p['id']) + "_" + friendlyfile + '.html'
            if os.path.exists(this_assmt_filename):
                d(" - already downloaded %s" % this_assmt_filename)
                this_assmt_content = open(this_assmt_filename, 'r').read()
            else:
                t2 = fetch('/api/v1/courses/' + str(id) + '/assignments/' + str(p['id']), verbose)
                with codecs.open(this_assmt_filename, 'w', 'utf-8') as fd:
                    this_assmt_content = "<h2>%s</h2>\n%s\n\n" % (t2['name'], t2['description'])
                    fd.write(this_assmt_content)
            if not headered:
                index.append(('<br /><b>Assignments</b><br />'))
                headered = 1
            index.append(('assignments/' + str(p['id']) + "_" + friendlyfile + '.html', p['name']))
            # write to running log of content in order of module
            if p['id'] in item_id_to_index:
                items[item_id_to_index[p['id']]] = this_assmt_content + '\n\n' + pagebreak
        except Exception as e:
            d(' * Problem %s' % str(e))
    ###
    ### FORUMS
    ###
    """forum_f = course_folder + '/forums'
headered = 0
image_count = 0
print("\nFORUMS")
try:
os.mkdir(forum_f)
forums = fetch('/api/v1/courses/' + str(id) + '/discussion_topics', verbose)
for p in forums:
p['title'] = clean_title(p['title'])
forum_id = p['id']
easier_filename = p['title']
for a in 'title,posted_at,published'.split(','):
print(str(p[a]), "\t", end=' ')
print("")
t2 = fetch('/api/v1/courses/' + str(id) + '/discussion_topics/'+str(forum_id), verbose)
#### REMOVED
bb = bs(t2['body'],features="lxml")
print("IMAGES IN THIS PAGE")
page_images = bb.find_all('img')
for I in page_images:
r = requests.get(I['src'],headers=header, stream=True)
mytype = r.headers['content-type']
print("Response is type: " + str(mytype))
r_parts = mytype.split("/")
ending = r_parts[-1]
with open(pages_f + '/' + str(image_count) + "." + ending, 'wb') as fd:
for chunk in r.iter_content(chunk_size=128):
fd.write(chunk)
image_count += 1
#### END REMOVED
try:
with codecs.open(forum_f + '/' + easier_filename + '.html', 'w','utf-8') as fd:
fd.write("<h1>"+t2['title']+"</h1>\n")
fd.write(t2['message'])
if not headered: index.append( ('<br /><b>Discussion Forums</b><br />') )
headered = 1
index.append( ( 'forums/' + easier_filename + '.html', p['title'] ) )
# write to running log of content in order of module
if p['id'] in item_id_to_index:
items_inorder[ item_id_to_index[ p['id'] ] ] = '<h1>'+t2['title']+'</h1>\n\n'+t2['message']+'\n\n'+pagebreak
else:
print(' This forum didnt seem to be in the modules list.')
except Exception as e:
print("Error here:", e)
#print p
#print results_dict
except Exception as e:
print("** Forum folder seems to exist. Skipping those.")
print(e)
###
### QUIZZES
###
# get a list external urls
headered = 0
t = url + '/api/v1/courses/' + str(id) + '/modules'
while t: t = fetch(t)
mods = results
results = []
for m in mods:
results = []
t2 = url + '/api/v1/courses/' + str(id) + '/modules/' + str(m['id']) + '/items'
while t2: t2 = fetch(t2)
items = results
for i in items:
#print i
if i['type'] == "ExternalUrl":
#print i
for j in 'id,title,external_url'.split(','):
print unicode(i[j]), "\t",
print ""
if not headered: index.append( ('<br /><b>External Links</b><br />') )
headered = 1
index.append( (i['external_url'], i['title']) )
"""
    # Create index page of all gathered items
    myindex = codecs.open(course_folder + '/index.html', 'w', 'utf-8')
    for i in index:
        if len(i) == 2:
            myindex.write("<a href='" + i[0] + "'>" + i[1] + "</a><br />\n")
        else:
            myindex.write(i)

    # Full course content in single file
    print("Writing main course files...")
    mycourse = codecs.open(course_folder + '/fullcourse.raw.html', 'w', 'utf-8')
    for I in items:
        if I:
            mycourse.write(I)
    temp = open('cache/coursedump.txt', 'w')
    temp.write("items: " + json.dumps(items, indent=2))
    temp.write("\n\n\n")
    temp.write("index: " + json.dumps(index, indent=2))
    temp.write("\n\n\n")
    temp.write("items_inorder: " + json.dumps(items_inorder, indent=2))
    temp.write("\n\n\n")
    temp.write("item_id_to_index: " + json.dumps(item_id_to_index, indent=2))
    temp.close()
    if video_link_list:
        mycourse.write('\n<h1>Videos Linked in Pages</h1>\n<table>')
        for V in video_link_list:
            (vid_url, vid_text, vid_page) = V
            mycourse.write("<tr><td><a target='_blank' href='" + vid_url + "'>" + vid_text + "</a></td><td> on <a target='_blank' href='" + vid_page + "'>" + vid_page + "</a></td></tr>\n")
        mycourse.write("</table>\n")
    mycourse.close()
    output = pypandoc.convert_file(course_folder + '/fullcourse.raw.html', 'html', outputfile=course_folder + "/fullcourse.html")
    output1 = pypandoc.convert_file(course_folder + '/fullcourse.html', 'md', outputfile=course_folder + "/fullcourse.md")
    output2 = pypandoc.convert_file(course_folder + '/fullcourse.html', 'docx', outputfile=course_folder + "/fullcourse.docx")
def pan_testing():
    course_folder = '../course_temps/course_6862'
    output3 = pypandoc.convert_file(course_folder + '/fullcourse.md', 'html', outputfile=course_folder + "/fullcourse.v2.html")
# Given a course, a title, and content, create a new page in the course
def create_page(course_num, new_title, new_content):
    t3 = url + '/api/v1/courses/' + str(course_num) + '/pages'
    #xyz = raw_input('Enter 1 to continue and send back to: ' + t3 + ': ')
    #print("Creating page: %s\nwith content:%s\n\n\n" % (new_title,new_content))
    print("Creating page: %s" % new_title)
    xyz = input('type 1 to confirm: ')  # '1'
    if xyz == '1':
        data = {'wiki_page[title]': new_title, 'wiki_page[body]': new_content}
        r3 = requests.post(t3, headers=header, params=data)
        print(r3)
        print('ok')
def md_to_course():
    #input = 'C:/Users/peter/Nextcloud/Documents/gavilan/student_orientation.txt'
    #output = 'C:/Users/peter/Nextcloud/Documents/gavilan/stu_orientation/student_orientation.html'
    id = "11214"
    infile = 'cache/pages/course_%s.md' % id
    output = 'cache/pages/course_%s_fixed.html' % id
    output3 = pypandoc.convert_file(infile, 'html', format='md', outputfile=output)
    xx = codecs.open(output, 'r', 'utf-8').read()
    soup = bs(xx, features="lxml")
    soup.encode("utf-8")
    current_page = ""
    current_title = ""
    for child in soup.body.children:
        if child.name == "h1" and not current_title:
            current_title = child.get_text()
        elif child.name == "h1":
            upload_page(id, current_title, current_page)
            current_title = child.get_text()
            current_page = ""
            print("Next page: %s" % current_title)
        else:
            #print(dir(child))
            if 'prettify' in dir(child):
                current_page += child.prettify(formatter="html")
            else:
                current_page += child.string
    upload_page(id, current_title, current_page)
    print("Done")
# DL pages only
def grab_course_pages(course_num=-1):
    global results, results_dict, url, header
    # course_num = raw_input("What is the course id? ")
    if isinstance(course_num, int) and course_num < 0:
        course_num = input("Id of course? ")
    course_num = str(course_num)
    modpagelist = []
    modurllist = []
    # We want things in the order of the modules
    t4 = url + '/api/v1/courses/' + str(course_num) + '/modules?include[]=items'
    results = fetch(t4)
    i = 1
    pageout = codecs.open('cache/pages/course_' + str(course_num) + '.html', 'w', 'utf-8')
    pageoutm = codecs.open('cache/pages/course_' + str(course_num) + '.md', 'w', 'utf-8')
    divider = "\n### "
    for M in results:
        print("Module Name: " + M['name'])
        for I in M['items']:
            if I['type'] == 'Page':
                modpagelist.append(I['title'])
                modurllist.append(I['page_url'])
                pageout.write(divider + I['title'] + '### ' + I['page_url'] + '\n')
                easier_filename = clean_title(I['page_url'])
                print(" " + str(i) + ". " + I['title'])
                t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + I['page_url']
                print('Getting: ' + t2)
                mypage = fetch(t2)
                fixed = checker.safe_html(mypage['body'])
                if fixed:
                    #markdown = h2m.convert(fixed)
                    #p_data = pandoc.read(mypage['body'])
                    markdown = pypandoc.convert_text("\n<h1>" + I['title'] + "</h1>\n" + mypage['body'], 'md', format='html')
                    pageout.write(fixed + '\n')
                    pageoutm.write(markdown + '\n')
                    pageout.flush()
                i += 1
    pageout.close()
    pageoutm.close()
# Download, clean html, and reupload page
def update_page():
    global results, results_dict, url, header
    # course_num = raw_input("What is the course id? ")
    course_num = '6862'
    t = url + '/api/v1/courses/' + str(course_num) + '/pages'
    # fetch() appears to follow pagination, accumulating records in the global `results` list
    while t:
        t = fetch(t)
    pages = results
    results = []
    mypagelist = []
    myurllist = []
    modpagelist = []
    modurllist = []
    for p in pages:
        p['title'] = clean_title(p['title'])
        mypagelist.append(p['title'])
        myurllist.append(p['url'])
        easier_filename = clean_title(p['url'])
        #for a in 'title,updated_at,published'.split(','):
        #    print(str(p[a]), "\t", end=' ')
        #print("")
    # We want things in the order of the modules
    t4 = url + '/api/v1/courses/' + str(course_num) + '/modules?include[]=items'
    while t4:
        t4 = fetch(t4)
    mods = results
    results = []
    i = 1
    print("\nWhat page do you want to repair?")
    for M in mods:
        print("Module Name: " + M['name'])
        for I in M['items']:
            if I['type'] == 'Page':
                modpagelist.append(I['title'])
                modurllist.append(I['page_url'])
                print(" " + str(i) + ". " + I['title'])
                i += 1
    choice = input("\n> ")
    choice = int(choice) - 1
    chosen_url = modurllist[choice]
    print('Fetching: ' + modpagelist[choice])
    t2 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + chosen_url
    print('From: ' + t2)
    results_dict = {}
    while t2:
        t2 = fetch(t2)
    mypage = results_dict
    fixed_page = checker.safe_html(mypage['body'])
    upload_page(course_num, chosen_url, fixed_page)
# Given course, page url, and new content, upload the new revision of a page
def upload_page(course_num, pageurl, new_content):
    print("Repaired page:\n\n")
    #print(new_content)
    print(pageurl)
    t3 = url + '/api/v1/courses/' + str(course_num) + '/pages/' + pageurl
    xyz = input('Enter 1 to continue and send back to: ' + t3 + ': ')
    #xyz = '1'
    if xyz == '1':
        data = {'wiki_page[body]': new_content}
        r3 = requests.put(t3, headers=header, params=data)
        print(r3)
        print('ok')
# Use template to build html page with homegrown subtitles
def build_srt_embed_php(data):
    template = codecs.open('template_srt_and_video.txt', 'r', 'utf-8').readlines()
    result = ''
    for L in template:
        L = re.sub('FRAMEID', data['frameid'], L)
        L = re.sub('TITLE', data['title'], L)
        L = re.sub('EMBEDLINK', data['embedlink'], L)
        L = re.sub('SRTFOLDERFILE', data['srtfolderfile'], L)
        result += L
    return result
def yt_title(code):
    global saved_titles
    if code in saved_titles:
        return saved_titles[code]
    a = requests.get('https://www.youtube.com/watch?v=%s' % code)
    bbb = bs(a.content, "lxml")
    ccc = bbb.find('title').text
    ccc = re.sub(r'\s\-\sYouTube', '', ccc)
    saved_titles[code] = ccc
    # keep the cache in the same file it is loaded from at the top of this module
    codecs.open('cache/saved_youtube_titles.json', 'w', 'utf-8').write(json.dumps(saved_titles))
    return ccc
def swap_youtube_subtitles():
    # example here: http://siloor.github.io/youtube.external.subtitle/examples/srt/
    # srt folder, look at all filenames
    srtlist = os.listdir('video_srt')
    i = 0
    for V in srtlist:
        print(str(i) + '. ' + V)
        i += 1
    choice = input("Which SRT folder? ")
    choice = srtlist[int(choice)]
    srt_folder = 'video_srt/' + choice
    class_srt_folder = choice
    srt_files = os.listdir(srt_folder)
    srt_shorts = {}
    print("\nThese are the subtitle files: " + str(srt_files))
    for V in srt_files:
        if V.endswith('srt'):
            V1 = re.sub(r'(\.\w+$)', '', V)
            srt_shorts[V] = minimal_string(V1)
    crs_id = input("What is the id of the course? ")
    grab_course_pages(crs_id)
    # grab_course_pages() writes to cache/pages/, so read the freshly generated copy from there
    v1_pages = codecs.open('cache/pages/course_' + str(crs_id) + '.html', 'r', 'utf-8')
    v1_content = v1_pages.read()
    # a temporary page of all youtube links
    tp = codecs.open('page_revisions/links_' + str(crs_id) + '.html', 'w', 'utf-8')
    # course pages, get them all and look for youtube embeds
    title_shorts = {}
    title_embedlink = {}
    title_list = []
    print("I'm looking for iframes and youtube links.")
    for L in v1_content.split('\n'):
        if re.search(r'<a.*?href="https://youtu', L):
            print("Possibly there's a linked video instead of embedded:" + L)
        if re.search('iframe', L):
            ma = re.compile(r'(\w+)=(".*?")')
            this_title = ''
            this_src = ''
            for g in ma.findall(L):
                print(g)
                if g[0] == 'title':
                    this_title = g[1].replace('"', '')
                if g[0] == 'src':
                    this_src = g[1].replace('"', '')
                #print(g)
            if not this_title:
                tmp = re.search(r'embed\/(.*?)\?', this_src)
                if not tmp:
                    tmp = re.search(r'embed\/(.*?)$', this_src)
                if tmp:
                    this_title = yt_title(tmp.groups()[0])
            title_shorts[this_title] = minimal_string(this_title)
            title_list.append(this_title)
            title_embedlink[this_title] = this_src
            print("%s\n" % this_title)
            tp.write("%s<br><a target='_blank' href='%s'>%s</a><br /><br />" % (this_title, this_src, this_src))
    # match them
    # lowercase, non alpha or num chars become a single space, try to match
    # if any srts remain unmatched, ask.
    tp.close()
    webbrowser.open_new_tab('file://C:/SCRIPTS/everything-json/page_revisions/links_' + str(crs_id) + '.html')
    matches = {}  # key is Title, value is srt file
    for S, v in list(srt_shorts.items()):
        found_match = 0
        print(v, end=' ')
        for T, Tv in list(title_shorts.items()):
            if v == Tv:
                print(' \tMatches: ' + T, end=' ')
                found_match = 1
                matches[T] = S
                break
    print("\nThese are the srt files: ")
    print(json.dumps(srt_shorts, indent=2))
    print("\nThese are the titles: ")
    print(json.dumps(title_shorts, indent=2))
    print("\nThese are the matches: ")
    print(json.dumps(matches, indent=2))
    print("There are %d SRT files and %d VIDEOS found. " % (len(srt_shorts), len(title_shorts)))
    for S, v in list(srt_shorts.items()):
        if S not in matches.values():
            print("\nDidn't find a match for: " + S)
            i = 0
            for T in title_list:
                if T not in matches:
                    print(str(i + 1) + ". " + T)
                i += 1
            print("Here's the first few lines of the SRT:")
            print(re.sub(r'\s+', ' ', '\n'.join(open(srt_folder + "/" + S, 'r').readlines()[0:10])) + "\n\n")
            choice = input("Which one should I match it to? (zero for no match) ")
            if int(choice) > 0:
                matches[title_list[int(choice) - 1]] = S
                print("SRT clean name was: %s, and TITLE clean name was: %s" % (v, title_shorts[title_list[int(choice) - 1]]))
    print("ok, here are the matches:")
    print(json.dumps(matches, indent=2))
    # construct subsidiary pages, upload them
    i = 0
    for m, v in list(matches.items()):
        # open template, do replacements
        i += 1
        data = {'frameid': 'videoframe' + str(i), 'title': m, 'embedlink': title_embedlink[m], 'srtfolderfile': v}
        print(json.dumps(data, indent=2))
        file_part = v.split('.')[0]
        new_php = codecs.open(srt_folder + '/' + file_part + '.php', 'w', 'utf-8')
        new_php.write(build_srt_embed_php(data))
        new_php.close()
    #srt_files = os.listdir(srt_folder)
    put_file(class_srt_folder)
def test_swap():
    crs_id = '6923'
    # swap in embed code and re-upload canvas pages
    v2_pages = codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8')
    v2_content = v2_pages.read()
    ma = re.compile(r'(\w+)=(".*?")')
    for L in v2_content.split('\n'):
        find = re.findall('<iframe(.*?)>', L)
        if find:
            print("Found: ", find)
            for each in find:
                this_title = ''
                this_src = ''
                for g in ma.findall(each):
                    #print(g)
                    if g[0] == 'title':
                        this_title = g[1].replace('"', '')
                    if g[0] == 'src':
                        this_src = g[1].replace('"', '')
                if not this_title:
                    tmp = re.search(r'embed\/(.*?)\?', this_src)
                    if not tmp:
                        tmp = re.search(r'embed\/(.*?)$', this_src)
                    if tmp:
                        this_title = yt_title(tmp.groups()[0])
                print("Found embed link: %s\n and title: %s\n" % (this_src, this_title))
def multiple_downloads():
    x = input("What IDs? Separate with one space: ")
    for id in x.split(" "):
        accessible_check(id)
###
###
### Text / Knowledge Base
###
### How about downloading all possible info / webpages / sources
### related to Gavilan and creating a master search index?
###
### Goals:
### - Scripted approach to allow re-indexing / updating
### - Break everything down into paragraphs
###
### - Script to extract keywords, topics, entities, summaries, questions answered
### from each paragraph or chunk.
### - Use spacy, gensim, nltk, or gpt-3, or a combination of all of them
###
### - Create vector / embeddings for each paragraph
###
### - Enable a vector search engine and connect to front page of gavilan.cc
### - Use that to feed a handful of source paragraphs (& prompt) into gpt and
###   receive text answers to questions. (A small similarity-search sketch follows
###   demo_vector_search below.)
def demo_vector_search():
    from gensim.models import Word2Vec
    from gensim.utils import simple_preprocess
    import nltk.data
    import spacy
    # (might have to upgrade pip first...)
    # pip install --upgrade click
    #
    # python -m spacy download en_core_web_sm
    # python -m spacy download en_core_web_lg
    def is_complete_sentence(text):
        #text = text.text
        doc = nlp(text)
        sentences = list(doc.sents)
        if len(sentences) == 1 and text.strip() == sentences[0].text.strip():
            return True
        return False
    sentences = [
        "This is an example sentence.",
        "Here is another sentence for training."
    ]
    paragraph = """Financial Aid services are available in person! We are happy to assist you with your financial aid needs. If you are interested in visiting the office in person, please review the guidelines for visiting campus and schedule your appointment:
Guidelines for In-Person Financial Aid Services
Due to FERPA regulations, no student information will be given to anyone other than the student without authorization from the student.
We continue to offer virtual services. Financial Aid staff may be reached by email, phone, text, and zoom! Please refer to the contact information and schedules below.
Gavilan-WelcomeCenter_Peer_Mentors.jpg
Do you need assistance filing the FAFSA or California Dream Act Application? Friendly and knowledgeable Peer Mentors are available to assist you virtually and in person! Details below for an online Zoom visit, phone call, or in-person visit with Peer Mentors.
Monday - Friday 8am - 5pm, Student Center
Join Zoom to Connect with a Peer Mentor
Or call (669) 900-6833 and use meeting ID 408 848 4800
MicrosoftTeams-image.png
Do you need assistance with an existing financial aid application, financial aid document submission, or review of your financial aid package? Schedule an in-person, phone, or zoom appointment with our Financial Aid counter.
Mon - Thurs: 9am - 1:00pm, 2:00pm - 5:00pm
Fri: 10am - 2pm
Office: (408) 848-4727 Email: finaid@gavilan.edu
Schedule an In-Person, Phone or Zoom Appointment"""
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences1 = tokenizer.tokenize(paragraph)
    for i, s in enumerate(sentences1):
        print(i, "\t", s)
    print("\n\n")
    #nlp = spacy.load('en_core_web_sm')
    nlp = spacy.load('en_core_web_md')
    doc = nlp(paragraph)
    sentences2 = list(doc.sents)
    for i, s in enumerate(sentences2):
        t = re.sub(r'\n+', ' ', s.text)
        is_sentence = 'yes' if is_complete_sentence(t) else 'no '
        print(i, " ", is_sentence, " ", t)
    print("\n\n")
    #for text in sentences2:
    #    print(text, "is a complete sentence?", is_complete_sentence(text))
    return
    # NOTE: the early return above means the Word2Vec demo below never runs as-is
    tokenized_sentences = [simple_preprocess(s) for s in sentences]
    model = Word2Vec(tokenized_sentences, min_count=1, vector_size=100)
    example_word = "example"
    vector = model.wv[example_word]
    print(f"Vector for the word '{example_word}': {vector}")
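# A minimal sketch of the "vector search" step from the plan above: embed each paragraph and
# rank by cosine similarity against a query. This is an illustration only, not part of the
# original pipeline; the function name and the reuse of spaCy's en_core_web_md vectors
# (already loaded in demo_vector_search) are assumptions.
def demo_paragraph_search(paragraphs, query, top_n=3):
    import numpy as np
    import spacy
    nlp = spacy.load('en_core_web_md')                  # md/lg models ship with word vectors
    para_vecs = [nlp(p).vector for p in paragraphs]     # one averaged vector per paragraph
    q_vec = nlp(query).vector
    def cos(a, b):
        denom = (np.linalg.norm(a) * np.linalg.norm(b)) or 1.0
        return float(np.dot(a, b) / denom)
    ranked = sorted(zip(paragraphs, para_vecs), key=lambda pv: cos(q_vec, pv[1]), reverse=True)
    return [p for p, _ in ranked[:top_n]]
# Hypothetical usage: demo_paragraph_search(list_of_paragraphs, "How do I get help with the FAFSA?")
# would return the closest paragraphs, which could then be pasted into a gpt prompt as described above.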
## TODO site scraper
## TODO find a package that extracts text from web pages
### TODO master list of what to index.
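# A hedged sketch of the extraction TODO above, using requests + BeautifulSoup (both already
# imported at the top of this file) instead of a dedicated package. The function name and the
# whitespace handling are illustrative assumptions, not a decided approach.
def page_to_text(page_url):
    resp = requests.get(page_url, timeout=30)
    soup = bs(resp.text, features="lxml")
    for tag in soup(['script', 'style']):   # drop non-content markup
        tag.decompose()
    text = soup.get_text(separator='\n')
    return re.sub(r'\n{2,}', '\n\n', text).strip()   # collapse leftover blank lines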
from pattern.web import URL, plaintext, extension
from pattern.web import download
from pattern.web import MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH, FIFO
from util import clean_title
save_folder = 'cache/crawl'
class GavCrawl(Crawler):
    def visit(self, link, source=None):
        print('visited:', repr(link.url), 'from:', link.referrer)
        txt = plaintext(link.source)  ## , keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
        codecs.open(save_folder + '/' + clean_title(link.url) + '.txt', 'w', 'utf-8').write(txt)
    def fail(self, link):
        print('failed:', repr(link.url))
def crawl():
    p = GavCrawl(links=['http://www.gavilan.edu/'], delay=3)
    while not p.done:
        p.crawl(method=DEPTH, cached=False, throttle=3)
def samples():
    crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
    url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
    print(url.mimetype in MIMETYPE_IMAGE)
    #html = download('http://www.clips.ua.ac.be/', unicode=True)
    s = URL('http://www.clips.ua.ac.be').download()
    s = plaintext(s, keep={'h1': [], 'h2': [], 'strong': [], 'a': ['href']})
    # getting absolute urls
    from pattern.web import URL, DOM, abs
    url = URL('http://www.clips.ua.ac.be')
    dom = DOM(url.download())
    for link in dom('a'):
        print(abs(link.attributes.get('href', ''), base=url.redirect or url.string))
    # get pdfs
    from pattern.web import URL, PDF
    url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
    pdf = PDF(url.download())
    print(pdf.string)
if __name__ == "__main__":
    print('')
    options = {
        1: ['download a class into a folder / word file', accessible_check],
        2: ['download multiple classes', multiple_downloads],
        3: ['convert stuff', pan_testing],
        4: ['convert md to html', md_to_course],
        # 5: ['import freshdesk content', freshdesk],
        6: ["download all of a course's pages", grab_course_pages],
        7: ['demo vector search', demo_vector_search],
    }
    for key in options:
        print(str(key) + '.\t' + options[key][0])
    print('')
    resp = input('Choose: ')
    # Call the function in the options dict
    options[int(resp)][1]()