# canvasapp/video.py

# Tools for detecting video embeds, swapping SRT subtitle files, etc


import codecs, re, requests, json, os, webbrowser
from bs4 import BeautifulSoup as bs
from util import minimal_string, stripper, mycleaner
from content import grab_course_pages
from pipelines import put_file


# Use template to build html page with homegrown subtitles
def build_srt_embed_php(data):
    """Fill the SRT/video page template with the values for one video.

    Reads 'template_srt_and_video.txt' (UTF-8, relative to the current
    working directory) and replaces each placeholder token with the
    matching entry of `data`:

        FRAMEID        -> data['frameid']
        TITLE          -> data['title']
        EMBEDLINK      -> data['embedlink']
        SRTFOLDERFILE  -> data['srtfolderfile']

    The values are inserted literally.  (A regex substitution would treat
    backslashes in a title or path as escape sequences, corrupting the
    output or raising re.error.)

    Returns the filled-in template as one string.
    Raises KeyError if `data` lacks one of the four keys.
    """
    with codecs.open('template_srt_and_video.txt', 'r', 'utf-8') as template:
        page = template.read()

    # Applied in this fixed order, one after the other, so a later token that
    # happens to appear inside an earlier value is still substituted.
    placeholders = (
        ('FRAMEID', 'frameid'),
        ('TITLE', 'title'),
        ('EMBEDLINK', 'embedlink'),
        ('SRTFOLDERFILE', 'srtfolderfile'),
    )
    for token, key in placeholders:
        page = page.replace(token, data[key])
    return page


def yt_title(code):
    """Return the title of the YouTube video with id `code`, with caching.

    The module-level dict `saved_titles` (expected to be defined elsewhere
    in this module) is consulted first.  On a miss the watch page is
    downloaded, the text of its <title> tag is taken with any ' - YouTube'
    text removed, the result is stored in `saved_titles`, and the whole
    cache is rewritten to 'saved_youtube_titles.json'.

    Returns '' (and caches nothing) when the page has no <title> tag, so a
    failed lookup can be retried later instead of raising AttributeError.
    """
    global saved_titles
    if code in saved_titles:
        return saved_titles[code]

    response = requests.get('https://www.youtube.com/watch?v=%s' % code)
    title_tag = bs(response.content, "lxml").find('title')
    if title_tag is None:
        return ''

    title = re.sub(r'\s\-\sYouTube', '', title_tag.text)
    saved_titles[code] = title
    # Persist the cache straight away; the context manager guarantees the
    # JSON is flushed and the handle is released.
    with codecs.open('saved_youtube_titles.json', 'w', 'utf-8') as cache_file:
        cache_file.write(json.dumps(saved_titles))
    return title

def swap_youtube_subtitles():
    """Interactively pair local SRT files with a course's video embeds.

    Steps, all driven from the console:

    1. List the sub-folders of 'video_srt', ask which one to use, and
       collect the files in it whose names end in 'srt'.
    2. Ask for a course id, call grab_course_pages() for it, and read
       'page_revisions/course_<id>.html'.
    3. Scan every line containing 'iframe' for title="..." and src="..."
       attributes.  When an iframe has no title, the video code is taken
       from the 'embed/<code>' part of the src and looked up with
       yt_title().  All links found are written to
       'page_revisions/links_<id>.html', which is then opened in a
       browser tab.
    4. Match SRT files to titles by comparing their minimal_string()
       forms; for every SRT file left unmatched, show its first lines and
       ask the user which title it belongs to.
    5. For each match, write '<srt name>.php' (built by
       build_srt_embed_php) into the SRT folder, then call put_file() on
       the folder name (presumably the upload step - confirm in pipelines).

    Raises ValueError / IndexError if the user types something that is not
    a valid list number.
    """
    # Local import: only needed to turn the links file path into a file:// URI.
    from pathlib import Path

    # example here:  http://siloor.github.io/youtube.external.subtitle/examples/srt/

    # srt folder, look at all filenames
    srtlist = os.listdir('video_srt')
    for idx, folder_name in enumerate(srtlist):
        print(str(idx) + '.  ' + folder_name)
    choice = input("Which SRT folder? ")
    class_srt_folder = srtlist[int(choice)]
    srt_folder = 'video_srt/' + class_srt_folder
    srt_files = os.listdir(srt_folder)
    print("\nThese are the subtitle files: " + str(srt_files))

    # srt file name -> normalised name (extension stripped first)
    srt_shorts = {}
    for srt_name in srt_files:
        if srt_name.endswith('srt'):
            srt_shorts[srt_name] = minimal_string(re.sub(r'(\.\w+$)', '', srt_name))

    crs_id = input("What is the id of the course?  ")
    grab_course_pages(crs_id)
    with codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8') as v1_pages:
        v1_content = v1_pages.read()

    # course pages, get them all and look for youtube embeds
    title_shorts = {}       # title -> normalised title
    title_embedlink = {}    # title -> iframe src
    title_list = []         # titles in page order; the user picks by position
    attr_re = re.compile(r'(\w+)=(".*?")')

    # a temporary page of all youtube links
    links_path = 'page_revisions/links_' + str(crs_id) + '.html'
    print("I'm looking for iframes and youtube links.")
    with codecs.open(links_path, 'w', 'utf-8') as tp:
        for L in v1_content.split('\n'):
            if re.search(r'<a.*?href="https:\/\/youtu', L):
                print("Possibly there's a linked video instead of embedded:" + L)
            if not re.search('iframe', L):
                continue

            # Reset both per line so nothing leaks over from a previous iframe.
            this_title = ''
            this_src = ''
            for g in attr_re.findall(L):
                print(g)
                if g[0] == 'title':
                    this_title = g[1].replace('"', '')
                if g[0] == 'src':
                    this_src = g[1].replace('"', '')

            # Lines such as a lone '</iframe>' mention 'iframe' but carry no
            # src; there is nothing to embed, so they are not recorded.
            if not this_src:
                continue

            if not this_title:
                tmp = re.search(r'embed\/(.*?)\?', this_src)
                if not tmp:
                    tmp = re.search(r'embed\/(.*?)$', this_src)
                if tmp:
                    this_title = yt_title(tmp.groups()[0])

            title_shorts[this_title] = minimal_string(this_title)
            title_list.append(this_title)
            title_embedlink[this_title] = this_src
            # ASCII-only copy for the console (decoded back to str so it
            # prints as text rather than as a b'...' repr).
            print("%s\n" % this_title.encode('ascii', 'ignore').decode('ascii'))
            tp.write("%s<br><a target='_blank' href='%s'>%s</a><br /><br />" % (this_title, this_src, this_src))

    # Open the file that was just written, wherever the script is run from.
    webbrowser.open_new_tab(Path(os.path.abspath(links_path)).as_uri())

    # match them
    # lowercase, non alpha or num chars become a single space, try to match
    # if any srts remain unmatched, ask.
    matches = {}                    # key is Title, value is srt file
    for srt_name, srt_short in srt_shorts.items():
        print(srt_short, end=' ')
        for title, title_short in title_shorts.items():
            if srt_short == title_short:
                print(' \tMatches: ' + title, end=' ')
                matches[title] = srt_name
                break

    print("\nThese are the srt files: ")
    print(json.dumps(srt_shorts, indent=2))
    print("\nThese are the titles: ")
    print(json.dumps(title_shorts, indent=2))
    print("\nThese are the matches: ")
    print(json.dumps(matches, indent=2))

    print(("There are %d SRT files and %d VIDEOS found. " % (len(srt_shorts), len(title_shorts))))

    for srt_name, srt_short in srt_shorts.items():
        if srt_name in matches.values():
            continue
        print("\nDidn't find a match for: " + srt_name)
        # Numbers are 1-based positions in title_list; already-matched titles
        # are hidden but keep their number so the answer maps straight back.
        for number, title in enumerate(title_list, start=1):
            if title not in matches:
                print(str(number) + ". " + title.encode('ascii', 'ignore').decode('ascii'))
        print("Here's the first few lines of the SRT:")
        with open(srt_folder + "/" + srt_name, 'r') as srt_file:
            first_lines = srt_file.readlines()[0:10]
        print((re.sub(r'\s+', ' ', '\n'.join(first_lines)) + "\n\n"))
        choice = input("Which one should I match it to? (zero for no match)  ")
        if int(choice) > 0:
            chosen_title = title_list[int(choice) - 1]
            matches[chosen_title] = srt_name
            print("SRT clean name was: %s, and TITLE clean name was: %s" % (srt_short, title_shorts[chosen_title]))
    print("ok, here are the matches:")
    print(json.dumps(matches, indent=2))

    # construct subsidiary pages, upload them
    for number, (title, srt_name) in enumerate(matches.items(), start=1):
        data = {
            'frameid': 'videoframe' + str(number),
            'title': title,
            'embedlink': title_embedlink[title],
            'srtfolderfile': srt_name,
        }
        print(json.dumps(data, indent=2))
        # Output name is everything before the first '.' of the srt name.
        file_part = srt_name.split('.')[0]
        with codecs.open(srt_folder + '/' + file_part + '.php', 'w', 'utf-8') as new_php:
            new_php.write(build_srt_embed_php(data))
    put_file(class_srt_folder)


def test_swap(crs_id='6923'):
    """Print the src and title of every iframe in a saved course page dump.

    Reads 'page_revisions/course_<crs_id>.html' (UTF-8) and, line by line,
    finds each '<iframe ...>' tag and pulls out its title="..." and
    src="..." attributes.  When a tag has no title, the video code is
    taken from the 'embed/<code>' part of the src and looked up with
    yt_title().

    NOTE: this only reports what it finds; it does not swap in embed code
    or re-upload any pages.

    crs_id: course id used to build the file name; defaults to the
            previously hard-coded '6923'.
    """
    attr_re = re.compile(r'(\w+)=(".*?")')
    with codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8') as v2_pages:
        v2_content = v2_pages.read()

    for L in v2_content.split('\n'):
        find = re.findall('<iframe(.*?)>', L)
        if not find:
            continue
        print("Found: ", find)
        for each in find:
            this_title = ''
            this_src = ''
            for g in attr_re.findall(each):
                if g[0] == 'title':
                    this_title = g[1].replace('"', '')
                if g[0] == 'src':
                    this_src = g[1].replace('"', '')
            if not this_title:
                tmp = re.search(r'embed\/(.*?)\?', this_src)
                if not tmp:
                    tmp = re.search(r'embed\/(.*?)$', this_src)
                if tmp:
                    this_title = yt_title(tmp.groups()[0])
            # ASCII-only copy for the console, decoded back to str so it
            # prints as text rather than as a b'...' repr.
            shown_title = this_title.encode('ascii', 'ignore').decode('ascii')
            print("Found embed link: %s\n and title: %s\n" % (this_src, shown_title))