183 lines
7.0 KiB
Python
183 lines
7.0 KiB
Python
# Tools for detecting video embeds, swapping SRT subtitle files, etc
|
|
|
|
|
|
import codecs, re, requests, json, os, webbrowser
|
|
from bs4 import BeautifulSoup as bs
|
|
from util import minimal_string, stripper, mycleaner
|
|
from content import grab_course_pages
|
|
from pipelines import put_file
|
|
|
|
|
|
# Use template to build html page with homegrown subtitles
|
|
def build_srt_embed_php(data):
    """Fill in the subtitle/video page template and return the page text.

    Reads 'template_srt_and_video.txt' (UTF-8) from the current working
    directory and substitutes the placeholders below, in this order:

        FRAMEID       -> data['frameid']
        TITLE         -> data['title']
        EMBEDLINK     -> data['embedlink']
        SRTFOLDERFILE -> data['srtfolderfile']

    Args:
        data: mapping with the four string values listed above.

    Returns:
        The template text with every placeholder occurrence replaced.

    Raises:
        KeyError: if one of the four keys is missing from `data`.
        OSError: if the template file cannot be opened.
    """
    # codecs.open is kept (rather than the builtin open) so line endings in
    # the template pass through untouched, exactly as before.
    with codecs.open('template_srt_and_video.txt', 'r', 'utf-8') as template:
        result = template.read()

    # The values are inserted with str.replace, not re.sub: a regex
    # replacement string interprets backslashes, so a title or link that
    # contains one (e.g. "C:\path") would be mangled or raise re.error.
    substitutions = (
        ('FRAMEID', 'frameid'),
        ('TITLE', 'title'),
        ('EMBEDLINK', 'embedlink'),
        ('SRTFOLDERFILE', 'srtfolderfile'),
    )
    for placeholder, key in substitutions:
        result = result.replace(placeholder, data[key])
    return result
|
|
|
|
|
|
|
|
|
|
def yt_title(code):
    """Return the title of the YouTube video whose id is `code`.

    Titles are looked up in the module-level `saved_titles` dict first
    (defined outside this function; presumably loaded from
    'saved_youtube_titles.json' at startup -- confirm). On a miss the watch
    page is fetched, its <title> text is taken with the " - YouTube" suffix
    removed, and the updated cache is written back to
    'saved_youtube_titles.json'.

    Args:
        code: the video id as it appears after "watch?v=".

    Returns:
        The video title, or '' when the fetched page has no <title> element
        (that result is not cached, so a later call will retry).
    """
    global saved_titles
    if code in saved_titles:
        return saved_titles[code]

    response = requests.get('https://www.youtube.com/watch?v=%s' % code)
    page = bs(response.content, "lxml")

    title_tag = page.find('title')
    if title_tag is None:
        # find() returns None when nothing matches; an empty title is what
        # callers already work with when no title could be determined.
        return ''

    title = re.sub(r'\s\-\sYouTube', '', title_tag.text)
    saved_titles[code] = title

    # Rewrite the whole cache; the with-block guarantees the file is flushed
    # and closed even if serialisation fails part-way.
    with codecs.open('saved_youtube_titles.json', 'w', 'utf-8') as cache_file:
        cache_file.write(json.dumps(saved_titles))
    return title
|
|
|
|
def swap_youtube_subtitles():
    """Interactively pair local SRT files with a course's embedded videos.

    Steps, all driven from the console:

    1. List the sub-folders of 'video_srt', ask which one to use, and build
       a normalised name (via minimal_string) for every '*srt' file in it.
    2. Ask for a course id, call grab_course_pages for it, then read
       'page_revisions/course_<id>.html' line by line looking for iframes.
       Each iframe's title (or, when absent, the title fetched by yt_title
       from the embed id) and src are recorded, and a links page is written
       to 'page_revisions/links_<id>.html' and opened in the browser.
    3. Auto-match SRT files to titles whose normalised names are equal, then
       prompt for every SRT file that is still unmatched.
    4. For each match, render the template with build_srt_embed_php into
       '<srt folder>/<srt name up to the first dot>.php', and finally pass
       the chosen folder name to put_file (presumably an upload -- confirm).

    Returns:
        None. All results are files written and text printed.
    """
    # example here: http://siloor.github.io/youtube.external.subtitle/examples/srt/

    # srt folder, look at all filenames
    srtlist = os.listdir('video_srt')
    for number, folder_name in enumerate(srtlist):
        print(str(number) + '. ' + folder_name)
    choice = input("Which SRT folder? ")
    choice = srtlist[int(choice)]
    srt_folder = 'video_srt/' + choice
    class_srt_folder = choice
    srt_files = os.listdir(srt_folder)
    print("\nThese are the subtitle files: " + str(srt_files))

    # srt filename -> normalised name (extension dropped first)
    srt_shorts = {}
    for srt_name in srt_files:
        if srt_name.endswith('srt'):
            srt_shorts[srt_name] = minimal_string(re.sub(r'(\.\w+$)', '', srt_name))

    crs_id = input("What is the id of the course? ")
    grab_course_pages(crs_id)
    with codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8') as v1_pages:
        v1_content = v1_pages.read()

    # course pages, get them all and look for youtube embeds
    title_shorts = {}      # title -> normalised title
    title_embedlink = {}   # title -> iframe src
    title_list = []        # titles in page order (used for the numbered menu)
    attribute_re = re.compile(r'(\w+)=(".*?")')
    print("I'm looking for iframes and youtube links.")

    # a temporary page of all youtube links
    with codecs.open('page_revisions/links_' + str(crs_id) + '.html', 'w', 'utf-8') as tp:
        for line in v1_content.split('\n'):
            if re.search(r'<a.*?href="https:\/\/youtu', line):
                print("Possibly there's a linked video instead of embedded:" + line)
            if not re.search('iframe', line):
                continue

            this_title = ''
            # Reset per iframe so a tag without src cannot pick up the
            # previous iframe's link (or hit an unbound name on the first).
            this_src = ''
            for attribute in attribute_re.findall(line):
                print(attribute)
                if attribute[0] == 'title':
                    this_title = attribute[1].replace('"', '')
                if attribute[0] == 'src':
                    this_src = attribute[1].replace('"', '')

            if not this_title:
                # No title attribute: pull the video id out of the embed
                # link (with or without a query string) and look it up.
                found = re.search(r'embed\/(.*?)\?', this_src)
                if not found:
                    found = re.search(r'embed\/(.*?)$', this_src)
                if found:
                    this_title = yt_title(found.groups()[0])

            title_shorts[this_title] = minimal_string(this_title)
            title_list.append(this_title)
            title_embedlink[this_title] = this_src
            # Round-trip through ASCII so odd characters cannot break the
            # console; decode again so a str (not a bytes repr) is printed.
            print("%s\n" % this_title.encode('ascii', 'ignore').decode('ascii'))
            tp.write("%s<br><a target='_blank' href='%s'>%s</a><br /><br />" % (this_title, this_src, this_src))

    webbrowser.open_new_tab('file://C:/SCRIPTS/everything-json/page_revisions/links_' + str(crs_id) + '.html')

    # match them
    # lowercase, non alpha or num chars become a single space, try to match
    # if any srts remain unmatched, ask.
    matches = {}  # key is Title, value is srt file
    for srt_name, srt_short in srt_shorts.items():
        print(srt_short, end=' ')
        for title, title_short in title_shorts.items():
            if srt_short == title_short:
                print(' \tMatches: ' + title, end=' ')
                matches[title] = srt_name
                break

    print("\nThese are the srt files: ")
    print(json.dumps(srt_shorts, indent=2))
    print("\nThese are the titles: ")
    print(json.dumps(title_shorts, indent=2))
    print("\nThese are the matches: ")
    print(json.dumps(matches, indent=2))

    print(("There are %d SRT files and %d VIDEOS found. " % (len(srt_shorts), len(title_shorts))))

    for srt_name, srt_short in srt_shorts.items():
        # matches grows inside this loop, so test membership afresh each time
        if srt_name in matches.values():
            continue

        print("\nDidn't find a match for: " + srt_name)
        # Menu numbers are positions in title_list (1-based); already
        # matched titles are simply not shown.
        for number, title in enumerate(title_list, 1):
            if title not in matches:
                print(str(number) + ". " + title.encode('ascii', 'ignore').decode('ascii'))

        print("Here's the first few lines of the SRT:")
        with open(srt_folder + "/" + srt_name, 'r') as srt_file:
            preview = srt_file.readlines()[0:10]
        print((re.sub(r'\s+', ' ', '\n'.join(preview)) + "\n\n"))

        choice = input("Which one should I match it to? (zero for no match) ")
        try:
            picked = int(choice)
        except ValueError:
            # blank or non-numeric answer: treat as "no match" instead of
            # aborting the whole session
            picked = 0
        if 0 < picked <= len(title_list):
            picked_title = title_list[picked - 1]
            matches[picked_title] = srt_name
            print("SRT clean name was: %s, and TITLE clean name was: %s" % (srt_short, title_shorts[picked_title]))

    print("ok, here are the matches:")
    print(json.dumps(matches, indent=2))

    # construct subsidiary pages, upload them
    for number, (title, srt_name) in enumerate(matches.items(), 1):
        data = {'frameid': 'videoframe' + str(number), 'title': title, 'embedlink': title_embedlink[title], 'srtfolderfile': srt_name}
        print(json.dumps(data, indent=2))
        file_part = srt_name.split('.')[0]
        with codecs.open(srt_folder + '/' + file_part + '.php', 'w', 'utf-8') as new_php:
            new_php.write(build_srt_embed_php(data))

    put_file(class_srt_folder)
|
|
|
|
|
|
def test_swap(crs_id='6923'):
    """Print the src and title of every iframe in a saved course page dump.

    Reads 'page_revisions/course_<crs_id>.html' and, for each '<iframe ...>'
    tag found on a line, extracts its title and src attributes. When there
    is no title attribute the video id is taken from the embed link and the
    title is fetched with yt_title. Nothing is modified or uploaded; the
    findings are only printed.

    Args:
        crs_id: course id used to build the input filename. Defaults to
            '6923', the value that was previously hard-coded.
    """
    # swap in embed code and re-upload canvas pages
    # NOTE(review): the comment above describes an intent the code below
    # does not implement yet -- it only reports what it finds.
    with codecs.open('page_revisions/course_' + str(crs_id) + '.html', 'r', 'utf-8') as v2_pages:
        v2_content = v2_pages.read()

    attribute_re = re.compile(r'(\w+)=(".*?")')
    iframe_re = re.compile(r'<iframe(.*?)>')

    for line in v2_content.split('\n'):
        iframes = iframe_re.findall(line)
        if not iframes:
            continue
        print("Found: ", iframes)

        for iframe_attrs in iframes:
            this_title = ''
            this_src = ''
            for attribute in attribute_re.findall(iframe_attrs):
                if attribute[0] == 'title':
                    this_title = attribute[1].replace('"', '')
                if attribute[0] == 'src':
                    this_src = attribute[1].replace('"', '')

            if not this_title:
                # Try the embed id before a query string first, then up to
                # the end of the link.
                found = re.search(r'embed\/(.*?)\?', this_src)
                if not found:
                    found = re.search(r'embed\/(.*?)$', this_src)
                if found:
                    this_title = yt_title(found.groups()[0])

            # Strip non-ASCII for console safety, then decode so the title
            # is printed as text rather than as a bytes repr.
            safe_title = this_title.encode('ascii', 'ignore').decode('ascii')
            print("Found embed link: %s\n and title: %s\n" % (this_src, safe_title))
|