canvasapp/video.py

183 lines
7.0 KiB
Python

# Tools for detecting video embeds, swapping SRT subtitle files, etc
import codecs, re, requests, json, os, webbrowser
from bs4 import BeautifulSoup as bs
from util import minimal_string, stripper, mycleaner
from content import grab_course_pages
from pipelines import put_file
# Use template to build html page with homegrown subtitles
def build_srt_embed_php(data):
    """Fill the subtitle/video page template with the values in ``data``.

    Reads ``template_srt_and_video.txt`` (relative to the current working
    directory, decoded as UTF-8) and substitutes the literal placeholders
    ``FRAMEID``, ``TITLE``, ``EMBEDLINK`` and ``SRTFOLDERFILE``, in that order.

    Args:
        data: mapping providing string values under the keys ``'frameid'``,
            ``'title'``, ``'embedlink'`` and ``'srtfolderfile'``.

    Returns:
        The template text with every placeholder occurrence replaced.

    Raises:
        KeyError: if ``data`` lacks one of the four keys.
        OSError: if the template file cannot be opened.
    """
    # Placeholder text -> key in ``data``; applied in this fixed order.
    placeholders = (
        ('FRAMEID', 'frameid'),
        ('TITLE', 'title'),
        ('EMBEDLINK', 'embedlink'),
        ('SRTFOLDERFILE', 'srtfolderfile'),
    )
    # ``with`` guarantees the template handle is closed, even on error.
    with codecs.open('template_srt_and_video.txt', 'r', 'utf-8') as template:
        page = template.read()
    for marker, key in placeholders:
        # Literal replacement on purpose: with re.sub the *value* would be
        # parsed as a replacement template, so a backslash in a title or URL
        # (e.g. "\d", "\1") would raise re.error or corrupt the output.
        page = page.replace(marker, data[key])
    return page
def yt_title(code):
    """Return the title of the YouTube video identified by ``code``.

    Lookups are cached in the module-level ``saved_titles`` dict. On a cache
    miss the watch page is downloaded, the text of its ``<title>`` element is
    taken, the " - YouTube" marker is removed, and the result is stored in
    ``saved_titles``; the whole cache is then rewritten to
    ``saved_youtube_titles.json`` in the current working directory.

    Args:
        code: the video id as it appears after ``watch?v=`` in a YouTube URL.

    Returns:
        The video title as a string.

    Raises:
        AttributeError: if the downloaded page has no ``<title>`` element.
        Whatever ``requests.get`` raises on network failure is propagated.
    """
    global saved_titles
    if code in saved_titles:
        return saved_titles[code]

    page = requests.get('https://www.youtube.com/watch?v=%s' % code)
    soup = bs(page.content, "lxml")
    title = soup.find('title').text
    title = re.sub(r'\s\-\sYouTube', '', title)
    saved_titles[code] = title

    # Write through a context manager so the cache file is flushed and closed
    # deterministically instead of whenever the handle is garbage collected.
    with codecs.open('saved_youtube_titles.json', 'w', 'utf-8') as cache_file:
        cache_file.write(json.dumps(saved_titles))
    return title
def _ascii_only(text):
    """Return ``text`` with every non-ASCII character dropped (console-safe)."""
    return text.encode('ascii', 'ignore').decode('ascii')


def swap_youtube_subtitles():
    """Interactively pair local SRT files with a course's embedded videos.

    Workflow, all driven from the console:

    1. List the sub-folders of ``video_srt`` and ask which one to use.
    2. Ask for a course id, call ``grab_course_pages`` and read
       ``page_revisions/course_<id>.html``.
    3. Scan that file line by line for ``iframe`` tags, collecting each
       embed's title (looked up with ``yt_title`` when the tag has none) and
       src. The links are also written to ``page_revisions/links_<id>.html``,
       which is opened in a browser tab.
    4. Match SRT files to titles by comparing their ``minimal_string`` forms;
       for every SRT left unmatched, ask the user to pick a title.
    5. For every match, write a ``.php`` page built by
       ``build_srt_embed_php`` into the SRT folder, then hand the folder name
       to ``put_file``.

    Returns:
        None. All results are files on disk and console output.

    Raises:
        ValueError / IndexError: if the SRT folder choice is not a valid index.
        OSError: if an expected folder or file is missing.
    """
    # Function-scope import: only needed to build the file:// URI below.
    from pathlib import Path

    # example here: http://siloor.github.io/youtube.external.subtitle/examples/srt/
    # srt folder, look at all filenames
    srtlist = os.listdir('video_srt')
    for i, V in enumerate(srtlist):
        print(str(i) + '. ' + V)
    choice = input("Which SRT folder? ")
    choice = srtlist[int(choice)]
    srt_folder = 'video_srt/' + choice
    class_srt_folder = choice
    srt_files = os.listdir(srt_folder)
    srt_shorts = {}  # key is srt file name, value is its normalized name
    print("\nThese are the subtitle files: " + str(srt_files))
    for V in srt_files:
        if V.endswith('srt'):
            V1 = re.sub(r'(\.\w+$)', '', V)  # drop the final extension
            srt_shorts[V] = minimal_string(V1)

    crs_id = input("What is the id of the course? ")
    grab_course_pages(crs_id)
    with codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8') as v1_pages:
        v1_content = v1_pages.read()

    # course pages, get them all and look for youtube embeds
    title_shorts = {}     # key is title, value is its normalized name
    title_embedlink = {}  # key is title, value is the iframe src
    title_list = []       # titles in page order; drives the numbered menu
    attr_pattern = re.compile(r'(\w+)=(".*?")')
    links_path = 'page_revisions/links_' + str(crs_id) + '.html'

    print("I'm looking for iframes and youtube links.")
    # a temporary page of all youtube links
    with codecs.open(links_path, 'w', 'utf-8') as tp:
        for L in v1_content.split('\n'):
            if re.search(r'<a.*?href="https:\/\/youtu', L):
                print("Possibly there's a linked video instead of embedded:" + L)
            if not re.search('iframe', L):
                continue

            # Reset both per line so nothing leaks in from an earlier iframe.
            this_title = ''
            this_src = ''
            for g in attr_pattern.findall(L):
                print(g)
                if g[0] == 'title':
                    this_title = g[1].replace('"', '')
                if g[0] == 'src':
                    this_src = g[1].replace('"', '')
            if not this_src:
                # The line only mentions "iframe" (e.g. a lone closing tag);
                # there is no embed link to record.
                continue

            if not this_title:
                tmp = re.search(r'embed\/(.*?)\?', this_src)
                if not tmp:
                    tmp = re.search(r'embed\/(.*?)$', this_src)
                if tmp:
                    this_title = yt_title(tmp.groups()[0])

            title_shorts[this_title] = minimal_string(this_title)
            title_list.append(this_title)
            title_embedlink[this_title] = this_src
            print("%s\n" % _ascii_only(this_title))
            tp.write("%s<br><a target='_blank' href='%s'>%s</a><br /><br />" % (this_title, this_src, this_src))

    # Open the links page that was just written, wherever the script runs from.
    webbrowser.open_new_tab(Path(links_path).resolve().as_uri())

    # match them
    # lowercase, non alpha or num chars become a single space, try to match
    # if any srts remain unmatched, ask.
    matches = {}  # key is Title, value is srt file
    for S, v in srt_shorts.items():
        print(v, end=' ')
        for T, Tv in title_shorts.items():
            if v == Tv:
                print(' \tMatches: ' + T, end=' ')
                matches[T] = S
                break

    print("\nThese are the srt files: ")
    print(json.dumps(srt_shorts, indent=2))
    print("\nThese are the titles: ")
    print(json.dumps(title_shorts, indent=2))
    print("\nThese are the matches: ")
    print(json.dumps(matches, indent=2))
    print(("There are %d SRT files and %d VIDEOS found. " % (len(srt_shorts), len(title_shorts))))

    for S, v in srt_shorts.items():
        if S in matches.values():
            continue
        print("\nDidn't find a match for: " + S)
        # Menu numbers are 1-based positions in title_list, so the answer can
        # be mapped straight back to a title; already-matched titles are hidden.
        for number, T in enumerate(title_list, 1):
            if T not in matches:
                print(str(number) + ". " + _ascii_only(T))
        print("Here's the first few lines of the SRT:")
        with open(srt_folder + "/" + S, 'r') as srt_file:
            head = srt_file.readlines()[0:10]
        print((re.sub(r'\s+', ' ', '\n'.join(head)) + "\n\n"))

        # Re-prompt on bad input instead of crashing and losing earlier answers.
        while True:
            choice = input("Which one should I match it to? (zero for no match) ")
            try:
                picked = int(choice)
            except ValueError:
                picked = -1
            if 0 <= picked <= len(title_list):
                break
            print("Please enter a number between 0 and %d." % len(title_list))

        if picked > 0:
            matches[title_list[picked - 1]] = S
            print("SRT clean name was: %s, and TITLE clean name was: %s" % (v, title_shorts[title_list[picked - 1]]))

    print("ok, here are the matches:")
    print(json.dumps(matches, indent=2))

    # construct subsidiary pages, upload them
    for i, (m, v) in enumerate(matches.items(), 1):
        data = {'frameid': 'videoframe' + str(i), 'title': m, 'embedlink': title_embedlink[m], 'srtfolderfile': v}
        print(json.dumps(data, indent=2))
        file_part = v.split('.')[0]  # name up to the first dot
        with codecs.open(srt_folder + '/' + file_part + '.php', 'w', 'utf-8') as new_php:
            new_php.write(build_srt_embed_php(data))

    # NOTE(review): presumably uploads the whole folder once all pages exist;
    # confirm against put_file in pipelines.
    put_file(class_srt_folder)
def test_swap(crs_id='6923'):
    """Print the src and title of every iframe in a saved course-pages file.

    Manual check helper: reads ``page_revisions/course_<crs_id>.html`` (which
    must already exist), finds every ``<iframe ...>`` tag on each line and
    prints its ``src`` and ``title`` attributes. When a tag has no title, the
    video id is taken from the ``embed/`` part of the src and looked up with
    ``yt_title``.

    Args:
        crs_id: course id used to build the file name. Defaults to ``'6923'``,
            the value that was previously hard-coded.

    Returns:
        None. Output goes to the console only.

    Raises:
        OSError: if the course-pages file cannot be opened.
    """
    # NOTE(review): the original comment here read "swap in embed code and
    # re-upload canvas pages"; the code below only detects and prints.
    with codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8') as v2_pages:
        v2_content = v2_pages.read()

    attr_pattern = re.compile(r'(\w+)=(".*?")')
    for L in v2_content.split('\n'):
        find = re.findall('<iframe(.*?)>', L)
        if not find:
            continue
        print("Found: ", find)
        for each in find:
            this_title = ''
            this_src = ''
            for g in attr_pattern.findall(each):
                if g[0] == 'title':
                    this_title = g[1].replace('"', '')
                if g[0] == 'src':
                    this_src = g[1].replace('"', '')
            if not this_title:
                tmp = re.search(r'embed\/(.*?)\?', this_src)
                if not tmp:
                    tmp = re.search(r'embed\/(.*?)$', this_src)
                if tmp:
                    this_title = yt_title(tmp.groups()[0])
            # Drop non-ASCII characters but keep a str, so the console shows
            # the title itself rather than a bytes repr (b'...').
            printable_title = this_title.encode('ascii', 'ignore').decode('ascii')
            print("Found embed link: %s\n and title: %s\n" % (this_src, printable_title))