# Tools for detecting video embeds, swapping SRT subtitle files, etc.

import codecs
import json
import os
import re
import webbrowser
from pathlib import Path

import requests
from bs4 import BeautifulSoup as bs

from util import minimal_string, stripper, mycleaner
from content import grab_course_pages
from pipelines import put_file

# Placeholders in the template, paired with the `data` key that fills them.
# Order matters: substitutions are applied one after another in this order.
_TEMPLATE_FIELDS = (
    ('FRAMEID', 'frameid'),
    ('TITLE', 'title'),
    ('EMBEDLINK', 'embedlink'),
    ('SRTFOLDERFILE', 'srtfolderfile'),
)

# name="value" pairs inside a tag.
_ATTR_RE = re.compile(r'(\w+)=(".*?")')

# NOTE(review): the original iframe pattern was lost (the file reached review
# with HTML-like text stripped, leaving an empty pattern). This is a
# reconstruction; confirm against version control.
_IFRAME_RE = re.compile(r'<iframe(.*?)>')


def _ascii_only(text):
    """Return `text` with every non-ASCII character dropped (still a str)."""
    return text.encode('ascii', 'ignore').decode('ascii')


def _ask_int(prompt, low, high):
    """Prompt until the user types an integer in [low, high]; return it."""
    while True:
        answer = input(prompt)
        try:
            value = int(answer)
        except ValueError:
            value = None
        if value is not None and low <= value <= high:
            return value
        print("Please enter a number from %d to %d." % (low, high))


def _load_saved_titles():
    """Read the on-disk title cache; return {} if it is missing or invalid."""
    try:
        with codecs.open('saved_youtube_titles.json', 'r', 'utf-8') as fh:
            return json.load(fh)
    except (IOError, ValueError):
        return {}


def _parse_embed(attr_text):
    """Pull (title, src) out of the attribute text of one iframe tag.

    When the tag carries no title, the video id is taken from the
    ``embed/<id>`` part of src and the title is looked up with yt_title().
    Either value may come back as an empty string.
    """
    this_title = ''
    this_src = ''
    for name, quoted in _ATTR_RE.findall(attr_text):
        if name == 'title':
            this_title = quoted.replace('"', '')
        if name == 'src':
            this_src = quoted.replace('"', '')
    if not this_title:
        # The id ends at the query string if there is one, else at end of src.
        tmp = (re.search(r'embed\/(.*?)\?', this_src)
               or re.search(r'embed\/(.*?)$', this_src))
        if tmp:
            this_title = yt_title(tmp.groups()[0])
    return this_title, this_src


def _srt_preview(path):
    """Return the first ten lines of `path` squeezed onto one line."""
    # errors='replace' so an oddly encoded subtitle file cannot abort the
    # interactive session just to show a preview.
    with open(path, 'r', errors='replace') as fh:
        head = fh.readlines()[0:10]
    return re.sub(r'\s+', ' ', '\n'.join(head))


# Use template to build html page with homegrown subtitles
def build_srt_embed_php(data):
    """Fill template_srt_and_video.txt with the values in `data`.

    `data` must have the string keys 'frameid', 'title', 'embedlink' and
    'srtfolderfile'; each replaces the matching upper-case placeholder
    everywhere it occurs in the template. Returns the filled-in text.
    """
    with codecs.open('template_srt_and_video.txt', 'r', 'utf-8') as fh:
        page = fh.read()
    for placeholder, key in _TEMPLATE_FIELDS:
        # Plain str.replace: the values are data, not regex replacement
        # templates, so a backslash in a title must be copied literally.
        page = page.replace(placeholder, data[key])
    return page


def yt_title(code):
    """Return the title of YouTube video `code`, using a JSON file cache.

    Titles are kept in the module-level dict `saved_titles` and written to
    saved_youtube_titles.json after every new lookup. Returns '' (and
    caches nothing) when the fetched page has no <title> element.
    """
    global saved_titles
    # No module-level definition of saved_titles is visible in this file, so
    # load it on first use rather than fail with NameError.
    if 'saved_titles' not in globals():
        saved_titles = _load_saved_titles()
    if code in saved_titles:
        return saved_titles[code]
    response = requests.get('https://www.youtube.com/watch?v=%s' % code)
    soup = bs(response.content, "lxml")
    title_tag = soup.find('title')
    if title_tag is None:
        return ''
    title = re.sub(r'\s\-\sYouTube', '', title_tag.text)
    saved_titles[code] = title
    with codecs.open('saved_youtube_titles.json', 'w', 'utf-8') as fh:
        fh.write(json.dumps(saved_titles))
    return title


def swap_youtube_subtitles():
    """Interactively pair SRT files with a course's embedded videos.

    Steps: pick a folder under video_srt/, download the course pages, list
    the iframe embeds found in them, match SRT files to video titles by
    their minimal_string() form (asking the user about leftovers), write
    one .php page per match next to the SRT files, then hand the folder to
    put_file().
    """
    # example here: http://siloor.github.io/youtube.external.subtitle/examples/srt/

    # srt folder, look at all filenames
    srtlist = os.listdir('video_srt')
    if not srtlist:
        print("No SRT folders found in video_srt.")
        return
    for i, V in enumerate(srtlist):
        print(str(i) + '. ' + V)
    choice = srtlist[_ask_int("Which SRT folder? ", 0, len(srtlist) - 1)]
    srt_folder = 'video_srt/' + choice
    class_srt_folder = choice
    srt_files = os.listdir(srt_folder)
    srt_shorts = {}  # key is srt filename, value is its cleaned-up name
    print("\nThese are the subtitle files: " + str(srt_files))
    for V in srt_files:
        if V.endswith('srt'):
            V1 = re.sub(r'(\.\w+$)', '', V)
            srt_shorts[V] = minimal_string(V1)

    crs_id = input("What is the id of the course? \n")
    grab_course_pages(crs_id)
    course_path = 'page_revisions/course_' + str(crs_id) + '.html'
    with codecs.open(course_path, 'r', 'utf-8') as v1_pages:
        v1_content = v1_pages.read()

    # a temporary page of all youtube links
    links_path = 'page_revisions/links_' + str(crs_id) + '.html'

    # course pages, get them all and look for youtube embeds
    title_shorts = {}     # key is Title, value is its cleaned-up name
    title_embedlink = {}  # key is Title, value is the iframe src
    title_list = []       # Titles in page order; user choices index into it
    print("I'm looking for iframes and youtube links.")
    # NOTE(review): the body of this loop was lost from the file (HTML-like
    # text was stripped, taking the regex and the write format with it). It
    # is rebuilt from the equivalent logic in test_swap() and from how the
    # three containers are used below. Any separate handling of plain
    # (non-iframe) YouTube links could not be recovered. Confirm against
    # version control.
    with codecs.open(links_path, 'w', 'utf-8') as tp:
        for L in v1_content.split('\n'):
            for each in _IFRAME_RE.findall(L):
                this_title, this_src = _parse_embed(each)
                if not this_title:
                    continue  # nothing to key the video by
                if this_title not in title_shorts:
                    title_list.append(this_title)
                title_shorts[this_title] = minimal_string(this_title)
                title_embedlink[this_title] = this_src
                tp.write("%s<br>\n<a href=\"%s\">%s</a><br>\n<br>\n"
                         % (this_title, this_src, this_src))

    # Open the page just written, wherever this checkout happens to live.
    webbrowser.open_new_tab(Path(links_path).resolve().as_uri())

    # match them
    # lowercase, non alpha or num chars become a single space, try to match
    # if any srts remain unmatched, ask.
    matches = {}  # key is Title, value is srt file
    for S, v in srt_shorts.items():
        print(v, end=' ')
        for T, Tv in title_shorts.items():
            if v == Tv:
                print(' \tMatches: ' + T, end=' ')
                matches[T] = S
                break

    print("\nThese are the srt files: ")
    print(json.dumps(srt_shorts, indent=2))
    print("\nThese are the titles: ")
    print(json.dumps(title_shorts, indent=2))
    print("\nThese are the matches: ")
    print(json.dumps(matches, indent=2))

    print("There are %d SRT files and %d VIDEOS found. "
          % (len(srt_shorts), len(title_shorts)))

    for S, v in srt_shorts.items():
        if S in matches.values():
            continue
        print("\nDidn't find a match for: " + S)
        # Numbers are positions in title_list (1-based) so the answer can be
        # used directly as an index, even though matched titles are hidden.
        for i, T in enumerate(title_list, 1):
            if T not in matches:
                print(str(i) + ". " + _ascii_only(T))
        print("Here's the first few lines of the SRT:")
        print(_srt_preview(srt_folder + "/" + S) + "\n\n")
        picked = _ask_int(
            "Which one should I match it to? \n(zero for no match) ",
            0, len(title_list))
        if picked > 0:
            chosen = title_list[picked - 1]
            matches[chosen] = S
            print("SRT clean name was: %s, and TITLE clean name was: %s"
                  % (v, title_shorts[chosen]))

    print("ok, here are the matches:")
    print(json.dumps(matches, indent=2))

    # construct subsidiary pages, upload them
    for i, (m, v) in enumerate(matches.items(), 1):
        data = {'frameid': 'videoframe' + str(i),
                'title': m,
                'embedlink': title_embedlink[m],
                'srtfolderfile': v}
        print(json.dumps(data, indent=2))
        # splitext drops only the final extension, so "week.1.srt" becomes
        # "week.1.php" instead of colliding with other "week.*" files.
        file_part = os.path.splitext(v)[0]
        php_path = srt_folder + '/' + file_part + '.php'
        with codecs.open(php_path, 'w', 'utf-8') as new_php:
            new_php.write(build_srt_embed_php(data))

    # NOTE(review): assumed to run once, after all pages are written; the
    # original indentation was not recoverable.
    put_file(class_srt_folder)


def test_swap():
    """Print every iframe embed found in the saved pages of course 6923."""
    crs_id = '6923'
    # swap in embed code and re-upload canvas pages
    course_path = 'page_revisions/course_' + str(crs_id) + '.html'
    with codecs.open(course_path, 'r', 'utf-8') as v2_pages:
        v2_content = v2_pages.read()

    for L in v2_content.split('\n'):
        find = _IFRAME_RE.findall(L)
        if find:
            print("Found: ", find)
            for each in find:
                this_title, this_src = _parse_embed(each)
                print("Found embed link: %s\n and title: %s\n"
                      % (this_src, _ascii_only(this_title)))