# canvasapp/depricated.py

#get_schedule('201770')
# from pipelines - canvas data
# todo: where does the most recent schedule come from?
# Input: xxxx_sched.json. Output: xxxx_latestarts.txt
def list_latestarts():
#term = input("Name of current semester file? (ex: sp18) ")
term = "sp23" # sems[0]
term_in = "cache/" + term + "_sched.json"
term_out = "cache/" + term + "_latestarts.txt"
print("Writing output to " + term_out)
infile = open(term_in, "r")
outfile = open(term_out, "w")
sched = json.loads(infile.read())
#print sched
by_date = {}
for C in sched:
parts = C['date'].split("-")
start = parts[0]
codes = C['code'].split(' ')
dept = codes[0]
if dept in ['JLE','JFT','CWE']:
continue
if re.search('TBA',start): continue
        try:
            startd = parser.parse(start)
        except Exception as e:
            print(e, "\nproblem parsing ", start)
            continue
#print startd
if not startd in by_date:
by_date[startd] = []
by_date[startd].append(C)
for X in sorted(by_date.keys()):
#print "Start: " + str(X)
if len(by_date[X]) < 200:
prettydate = X.strftime("%A, %B %d")
print(prettydate + ": " + str(len(by_date[X])) + " courses")
outfile.write(prettydate + ": " + str(len(by_date[X])) + " courses" + "\n")
for Y in by_date[X]:
#print "\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher']
print(Y)
#outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] +"\n")
outfile.write("\t" + Y['code'] + " " + Y['crn'] + "\t" + Y['teacher'] + "\t" + Y['type'] + "\t" + "\n")
online_courses = {}
def prep_online_courses_df():
global online_courses
schedule = current_schedule() # from banner
online_courses = schedule[lambda x: x.type=='online']
def course_is_online(crn):
global online_courses
#print "looking up: " + str(crn)
#print online_courses
course = online_courses[lambda x: x.crn==int(crn)]
return len(course)
def get_crn_from_name(name):
#print "name is: "
#print(name)
m = re.search( r'(\d\d\d\d\d)', name)
if m: return int(m.groups(1)[0])
else: return 0
def get_enrlmts_for_user(user,enrollments):
#active enrollments
u_en = enrollments[ lambda x: (x['user_id'] == user) & (x['workflow']=='active') ]
return u_en[['type','course_id']]
"""
timestamp = nowAsStr()
requestParts = [ method,
host,
'', #content Type Header
'', #content MD5 Header
path,
'', #alpha-sorted Query Params
timestamp,
apiSecret ]
#Build the request
requestMessage = '\n'.join( requestParts )
requestMessage = requestMessage.encode('ASCII')
print((requestMessage.__repr__()))
hmacObject = hmac.new(bytearray(apiSecret,'ASCII'), bytearray('','ASCII'), hashlib.sha256) #
hmacObject.update(requestMessage)
hmac_digest = hmacObject.digest()
sig = base64.b64encode(hmac_digest)
headerDict = {
'Authorization' : 'HMACAuth ' + apiKey + ':' + str(sig),
'Date' : timestamp
}
"""
# Don't know
def demo():
resp = do_request('/api/account/self/file/sync')
mylog.write(json.dumps(resp, indent=4))
sample_table = resp['files'][10]
filename = sample_table['filename']
print(sample_table['table'])
response = requests.request(method='GET', url=sample_table['url'], stream=True)
    if response.status_code != 200:
        print('Request response went bad. Got back a', response.status_code, 'code, meaning the request was', response.reason)
else:
#Use the downloaded data
with open(local_data_folder + filename, 'wb') as fd:
for chunk in response.iter_content(chunk_size=128):
fd.write(chunk)
print("Success")
        if filename.split('.')[-1] == 'gz':
            plain_filename = 'canvas_data/' + ".".join(filename.split('.')[:-1])
            # gunzip the download; write bytes out in binary mode
            with gzip.open('canvas_data/' + filename, 'rb') as f, open(plain_filename, 'wb') as pf:
                pf.write(f.read())
# How to drop columns
#columns = ['Col1', 'Col2', ...]
#df.drop(columns, inplace=True, axis=1)
# left join, one on column, one on index
#merged = pd.merge(result,users,left_index=True,right_on='id', how='left')
"""
You can call set_index on the result of the dataframe:
In [2]:
data=[['Australia',100],['France',200],['Germany',300],['America',400]]
pd.DataFrame(data,columns=['Country','Volume']).set_index('Country')
Out[2]:
Volume
Country
Australia 100
France 200
Germany 300
America 400
"""
def stats():
# nothing seems to happen here?
#input = csv.DictReader(codecs.open(schedfile,'r','utf-8'))
input = csv.DictReader(open(schedfile,'r'))
out2 = open('temp2.csv','w')
clean = {}
for r in input:
if r['crn']: clean[ r['crn'] ] = r
for c,r in list(clean.items()):
try:
if int(r['cap'])==0: continue
else: prct = (1.0 * int( r['act'] )) / int(r['cap'])
if prct < 0.01: continue
o_str = ''
if r['location'].strip()=='ONLINE': o_str = 'online'
#print r['location']
date_parts = r['date'].split('-')
start = strptime(date_parts[0], '%m/%d')
if start > semester_begin: o_str += "\tlatestart " + date_parts[0]
out2.write( "".join([c, "\t", r['sub'], "\t", r['crs'], "\t", str(round(prct,2)), "% full\t", o_str, "\n"]) )
except:
pass
######### from curriculum.py
# open('cache/programs/programs_1.txt','r').read()
"""
SEE serve.py .... i mean ... interactive.py
def dict_generator(indict, pre=None):
pre = pre[:] if pre else []
if isinstance(indict, dict):
for key, value in indict.items():
if isinstance(value, dict):
for d in dict_generator(value, pre + [key]):
yield d
elif isinstance(value, list) or isinstance(value, tuple):
for v in value:
for d in dict_generator(v, pre + [key]):
yield d
else:
yield str(pre) + " " + str([key, value]) + "\n"
else:
yield pre + [indict]
yield str(pre) + " " + str([indict]) + "\n"
def print_dict(v, prefix='',indent=''):
if isinstance(v, dict):
return [ print_dict(v2, "{}['{}']".format(prefix, k) + "<br />", indent+" " ) for k, v2 in v.items() ]
elif isinstance(v, list):
return [ print_dict( v2, "{}[{}]".format(prefix , i) + "<br />", indent+" ") for i, v2 in enumerate(v) ]
else:
return '{} = {}'.format(prefix, repr(v)) + "\n"
def walk_file():
j = json.loads(open('cache/programs/programs_2.txt','r').read())
return print_dict(j)
from flask import Flask
from flask import request
def tag(x,y): return "<%s>%s</%s>" % (x,y,x)
def tagc(x,c,y): return '<%s class="%s">%s</%s>' % (x,c,y,x)
def a(t,h): return '<a href="%s">%s</a>' % (h,t)
def server_save(key,value):
codecs.open('cache/server_data.txt','a').write( "%s=%s\n" % (str(key),str(value)))
def flask_thread(q):
app = Flask(__name__)
@app.route("/")
def home():
return tag('h1','This is my server.') + "<br />" + a('want to shut down?','/sd')
@app.route("/save/<key>/<val>")
def s(key,val):
server_save(key,val)
return tag('h1','Saved.') + "<br />" + tag('p', 'Saved: %s = %s' % (str(key),str(val)))
@app.route("/crazy")
def hello():
r = '<link rel="stylesheet" href="static/bootstrap.min.css">'
r += tag('style', 'textarea { white-space:nowrap; }')
r += tag('body', \
tagc('div','container-fluid', \
tagc('div','row', \
tagc( 'div', 'col-md-6', tag('pre', walk_file() ) ) + \
tagc( 'div', 'col-md-6', 'Column 2' + a('Shut Down','/shutdown' ) ) ) ) )
return r
@app.route("/sd")
def sd():
print('SIGINT or CTRL-C detected. Exiting gracefully')
func = request.environ.get('werkzeug.server.shutdown')
if func is None:
raise RuntimeError('Not running with the Werkzeug Server')
func()
return "Server has shut down."
app.run()
from queue import Queue
q = Queue()
def serve():
import webbrowser
import threading
x = threading.Thread(target=flask_thread, args=(q,))
x.start()
webbrowser.open_new_tab("http://localhost:5000")
#s = open('cache/programs/index.json','w')
#s.write( json.dumps({'departments':sorted(list(dept_index)), 'programs':prog_index}, indent=2) )
#s.close()
"""
### courses.py
##########
########## CALCULATING SEMESTER STUFF
##########
def summarize_proportion_online_classes(u):
# u is a "group" from the groupby fxn
#print u
if NUM_ONLY:
if ((1.0 * u.sum()) / u.size) > 0.85: return '2'
if ((1.0 * u.sum()) / u.size) < 0.15: return '0'
return '1'
else:
if ((1.0 * u.sum()) / u.size) > 0.85: return 'online-only'
if ((1.0 * u.sum()) / u.size) < 0.15: return 'f2f-only'
return 'mixed'
def summarize_num_term_classes(u):
# u is a "group" from the groupby fxn
# term is sp18 now
#print u
return u.size
# Prompt for course id, return list of user dicts. TODO this duplicates courses.py ??
def getUsersInCourse(id=0): # returns list
if not id:
id = str(input("The Course ID? "))
id = str(id)
return fetch('/api/v1/courses/%s/users' % id, 0)
#### curriculum.py
def recur_look_for_leafs(item,indent=0,show=1):
global leafcount, displaynames
ii = indent * " "
is_leaf = am_i_a_leaf(item)
if type(item) == type({}):
status = ""
if show:
status = "Dict"
if is_leaf:
leafcount += 1
status = "Leaf Dict"
if status:
print("\n%s%s" % (ii,status))
indent += 1
ii = indent * " "
for K,V in list(item.items()):
if show or is_leaf:
print("%s%s:" % (ii, K), end="")
if K =='displayName': displaynames.append(V)
recur_look_for_leafs(V,indent+1,show or is_leaf)
elif type(item) == type([]):
status = ""
if show: status = "List (" + str( len(item) ) + ")"
if is_leaf: status = "Leaf List (" + str( len(item) ) + ")"
if status:
print("\n%s%s" % (ii,status))
indent += 1
ii = indent * " "
for V in item:
recur_look_for_leafs(V,indent+1, show or is_leaf)
elif type(item) == type("abc"):
if show: print("%s%s" % (' ', item))
elif type(item) == type(55):
if show: print("%s%i" % (' ', item))
elif type(item) == type(5.5):
if show: print("%s%f" % (' ', item))
elif type(item) == type(False):
if show: print("%s%s" % (' ', str(item)))
def am_i_a_leaf(item):
if type(item) == type({}):
for K,V in list(item.items()):
if type(V) == type({}) or type(V) == type([]):
return False
elif type(item) == type([]):
for V in item:
if type(V) == type({}) or type(V) == type([]):
return False
elif type(item) == type("abc"): return True
elif type(item) == type(55): return True
elif type(item) == type(5.5): return True
elif type(item) == type(False):
if item == False: return True
elif item == True: return True
return True
def sampleclass():
theclass = json.loads( codecs.open('cache/courses/samplecourse.json','r','utf-8').read() )
#print(json.dumps(theclass,indent=2))
recur_look_for_leafs(theclass)
print(leafcount)
print(sorted(displaynames))
def matchstyle():
theclass = json.loads( codecs.open('cache/courses/samplecourse.json','r','utf-8').read() )
print("\n".join(recur_matcher(theclass)))
# 7: ['pattern matcher style', matchstyle],
# 8: ['pattern matcher - test on all classes', match_style_test],
##### from localcache
def user_role_and_online():
# cross list users, classes enrolled, and their roles
global role_table, term_courses
role_table = enrollment_file()
user_table = users_file()
user_table = user_table[ user_table['name']!="Test Student" ]
term_table = term_file()
current = term_table[lambda d: d.course_section=='2020 Spring'] # current semester from canvas
term_id = current['id'].values[0]
course_table = courses_file() # from canvas
schedule = current_schedule() # from banner...
term_courses = course_table[lambda d: d.termid==term_id] # courses this semester ... now add a crn column
term_courses['crn'] = term_courses['code'].map( lambda x: get_crn_from_name(x) )
# add is_online flag (for courses listed in schedule as online-only)
term_courses['is_online'] = term_courses['crn'].map( lambda x: course_is_online( x ) ) # kinda redundant
ban_can = term_courses.merge(schedule,on='crn',how='left') #join the schedule from banner to the courses from canvas
role_table = role_table.where(lambda x: x.workflow=='active')
# this join limits to current semester if 'inner', or all semesters if 'left'
courses_and_enrol = role_table.merge(ban_can,left_on='course_id',right_on='id', how='left')
user_table = user_table.drop(columns="rootactid tz created vis school position gender locale public bd cc state".split(" "))
c_e_user = courses_and_enrol.merge(user_table,left_on='user_id',right_on='id',how='left')
prop_online = pd.DataFrame(c_e_user.groupby(['user_id'])['is_online'].aggregate(summarize_proportion_online_classes).rename('proportion_online'))
num_trm_crs = pd.DataFrame(c_e_user.groupby(['user_id'])['is_online'].aggregate(summarize_num_term_classes).rename('num_term_crs'))
stu_tch_rol = pd.DataFrame(c_e_user.groupby(['user_id'])['type'].aggregate(summarize_student_teacher_role).rename('main_role'))
user_table = user_table.merge(prop_online,left_on='id',right_index=True)
user_table = user_table.merge(num_trm_crs,left_on='id',right_index=True)
user_table = user_table.merge(stu_tch_rol,left_on='id',right_index=True)
# remove name-less entries
user_table = user_table.where(lambda x: (x.canvasid!='') ) # math.isnan(x.canvasid))
return user_table
#print user_table.query('proportion_online=="online-only"')
#print user_table.query('main_role=="teacher"')
#user_table.to_csv('canvas_data/users_online.csv')
"""e_qry = "CREATE TABLE IF NOT EXISTS enrollments (
id integer PRIMARY KEY,
name text NOT NULL,
begin_date text,
end_date text
);"""
"""
['CREATE INDEX "idx_req_userid" ON "requests" ("id","courseid","userid" );',
'CREATE INDEX "idx_users_id" ON "users" ("id","canvasid", );',
'CREATE INDEX "idx_term_id" ON "terms" ("id","canvasid" );',
'CREATE INDEX "idx_enrollment" ON "enrollment" ("cid","course_id","user_id" );',
'CREATE INDEX "idx_courses" ON "courses" ("id","canvasid","termid","code","name" );' ]
took 6 seconds
select * from users where name = "Peter Howell"
select * from users join requests on users.id = requests.userid where name = "Peter Howell"
20k rows in 1.014 seconds!! with index above
without: killed it after 120 seconds
select timestamp, url, useragent, httpmethod, remoteip, controller from users join requests on users.id = requests.userid where name = "Peter Howell" order by requests.timestamp
select courses.name, courses.code, terms.name, requests.url from courses
join terms on courses.termid = terms.id
join requests on courses.id = requests.courseid
where terms.name='2020 Spring ' and courses.code='ACCT20 SP20 40039'
order by courses.code
"""
def more_unused_xreferencing():
"""continue
for line in lines:
r = requests_line(line.decode('utf-8'),filei)
if filei < 5:
print(r)
else:
break
filei += 1
by_date_course = defaultdict( lambda: defaultdict(int) )
by_date_user = defaultdict( lambda: defaultdict(int) )
df_list = []
df_list_crs = []
users = defaultdict( lambda: defaultdict(int) )
#by_user = {}
#by_course = {}
i = 0
limit = 300
#print(r)
date = dt.strptime( r['timestamp'], "%Y-%m-%d %H:%M:%S.%f" )
if r['userid'] in users:
users[r['userid']]['freq'] += 1
if users[r['userid']]['lastseen'] < date:
users[r['userid']]['lastseen'] = date
else:
users[r['userid']] = {"id":r['userid'], "lastseen":date, "freq":1}
by_date_course[ r['day'] ][ r['courseid'] ] += 1
by_date_user[ r['day'] ][ r['userid'] ] += 1
#if r['userid'] in by_user: by_user[r['userid']] += 1
#else: by_user[r['userid']] = 1
#if r['courseid'] in by_course: by_course[r['courseid']] += 1
#else: by_course[r['courseid']] = 1
#mylog.write("by_user = " + str(by_user))
df_list.append(pd.DataFrame(data=by_date_user))
df_list_crs.append(pd.DataFrame(data=by_date_course))
i += 1
if i > limit: break
#mylog.write("by_date_course = ")
result = pd.concat(df_list, axis=1,join='outer')
result_crs = pd.concat(df_list_crs, axis=1,join='outer')
#print result_crs
mylog.write(result.to_csv())
# get users
usersf = user_role_and_online()
merged = pd.merge(result,usersf,left_index=True,right_on='id', how='left')
#dropkeys = "rootactid tz created vis school position gender locale public bd cc state".split(" ")
#merged.drop(dropkeys, inplace=True, axis=1)
mglog = open(local_data_folder+'userlogs.csv','w')
mglog.write(merged.to_csv())
# get courses
courses = courses_file()
merged2 = pd.merge(result_crs,courses,left_index=True,right_on='id', how='left')
dropkeys = "rootactid wikiid".split(" ")
merged2.drop(dropkeys, inplace=True, axis=1)
mglogc = open(local_data_folder + 'courselogs.csv','w')
mglogc.write(merged2.to_csv())
# a users / freq / lastseen file
ufl = open(local_data_folder + "user_freq.json","w")
today = datetime.datetime.today()
for U in list(users.keys()):
date = users[U]['lastseen']
users[U]['lastseen'] = date.strftime("%Y-%m-%d")
diff = today - date
users[U]['daysago'] = str(diff.days)
users[U]['hoursago'] = str(int(diff.total_seconds()/3600))
us_frame = pd.DataFrame.from_dict(users,orient='index')
us_with_names = pd.merge(us_frame,usersf,left_index=True,right_on='id', how='left')
#dropkeys = "id id_x id_y globalid rootactid tz created vis school position gender locale public bd cc state".split(" ")
#us_with_names.drop(dropkeys, inplace=True, axis=1)
print(us_with_names)
ufl.write( json.dumps(users, indent=4) )
ufl.close()
mglogd = open('canvas_data/user_freq.csv','w')
mglogd.write(us_with_names.to_csv())
"""
""" -- projects table
CREATE TABLE IF NOT EXISTS projects (
id integer PRIMARY KEY,
name text NOT NULL,
begin_date text,
end_date text
);
"""
pass
def users_p_file():
uf = users_file()
pf = pseudonym_file()
#print pf
upf = uf.merge(pf,left_on='id',right_on='user_id',how='left')
return upf
"""
def com_channel_dim():
all = os.listdir(local_data_folder)
all.sort(key=lambda x: os.stat(os.path.join(local_data_folder,x)).st_mtime)
all.reverse()
#print "sorted file list:"
#print all
for F in all:
if re.search('communication_channel_dim',F):
cc_file = F
break
print("most recent communication channel file is " + cc_file)
cc_users = []
for line in gzip.open(local_data_folder + cc_file,'r'):
line_dict = dict(list(zip(cc_format, line.split("\t"))))
#line_dict['globalid'] = line_dict['globalid'].rstrip()
cc_users.append(line_dict)
df = pd.DataFrame(cc_users)
return df
"""
"""grp_sum_qry = ""SELECT u.sortablename, r.timeblock, SUM(r.viewcount), u.canvasid AS user, c.canvasid AS course
FROM requests_sum1 AS r
JOIN courses AS c ON e.course_id=c.id
JOIN enrollment as e ON r.courseid=c.id
JOIN users AS u ON u.id=e.user_id
WHERE c.canvasid=%s AND e."type"="StudentEnrollment"
GROUP BY u.id,c.id,r.timeblock
ORDER BY u.sortablename DESC, r.timeblock"" % course_id
q = ""SELECT u.sortablename, r.timeblock, r.viewcount, u.canvasid AS user, c.canvasid AS course
FROM requests_sum1 AS r
JOIN courses AS c ON e.course_id=c.id
JOIN enrollment as e ON r.courseid=c.id
JOIN users AS u ON u.id=e.user_id
WHERE c.canvasid=%s AND e."type"="StudentEnrollment" AND u.canvasid=810
ORDER BY u.sortablename DESC, r.timeblock"" % course_id
q = ""SELECT u.sortablename, r.timeblock, r.viewcount, u.canvasid AS user, c.canvasid AS course FROM enrollment as e JOIN courses AS c ON e.course_id=c.id
JOIN requests_sum1 AS r ON r.courseid=c.id
JOIN users AS u ON u.id=e.user_id
WHERE c.canvasid=%s AND e."type"="StudentEnrollment"
ORDER BY u.sortablename, r.timeblock"" % course_id"""
stem_course_id = '11015' # TODO
# NO LONGER USED - SEE COURSES
def enroll_stem_students():
depts = "MATH BIO CHEM PHYS ASTR GEOG".split(" ")
students = set()
for d in depts:
students.update(dept_classes(d))
print(students)
to_enroll = [ x for x in students if x not in already_enrolled ]
print(to_enroll)
print("prev line is people to enroll\nnext line is students already enrolled in stem")
print(already_enrolled)
for s in to_enroll:
t = url + '/api/v1/courses/%s/enrollments' % stem_course_id
data = { 'enrollment[user_id]': s[1], 'enrollment[type]':'StudentEnrollment',
'enrollment[enrollment_state]': 'active' }
print(data)
print(t)
if input('enter to enroll %s or q to quit: ' % s[0]) == 'q':
break
r3 = requests.post(t, headers=header, params=data)
print(r3.text)
#####
##### from users.py pretty much just use sql now
# unused?
def getAllTeachersInTerm(): # a list
# classes taught in last 3 semesters
# How many of them were published and used
# hits in last week/month/year
# most common department
# email addr
all_courses = {}
teachers = {} # keyed by goo
# { 'name':'', 'id':'', 'email':'', 'goo':'', 'classes':[ (#name,#id,#pubd,#hitsbyteacher) ... ] }
# This is a bit different from the 1 year schedule above, because it looks at
# people who were active in their shells in iLearn.
outfile = codecs.open('teacherdata/historical_shells_used.json','w', encoding='utf-8')
for term in last_4_semesters_ids: # [60,]:
print(("Fetching term: " + str(term)))
all_courses[term] = \
            fetch('/api/v1/accounts/1/courses?enrollment_term_id=' + str(term) + '&per_page=100')
i = 0
j = 0
for k,v in list(all_courses.items()): ##### term k, list v
for a_class in v:
            print(a_class['name'])
published = 0
if a_class['workflow_state'] in ['available','completed']:
j += 1
published = 1
i += 1
#if i > 20: break
tch = fetch('/api/v1/courses/' + str(a_class['id']) + '/search_users?enrollment_type=teacher')
for r in tch: ##### TEACHER r of COURSE a_class
name = str(r['sortable_name'])
if not 'sis_import_id' in r:
print("This user wasn't available: " + name)
continue
goo = str(r['sis_import_id'])
                print(r['sortable_name'])
if not name in teachers:
email = getEmail(r['id'])
teachers[name] = { 'name':r['sortable_name'], 'id':r['id'], 'email':email, 'goo':goo, 'classes':[] }
info = (a_class['name'],a_class['id'],published)
teachers[name]['classes'].append( info )
## TODO: hits in courses by teachers https://gavilan.instructure.com:443/api/v1/users/2/page_views?end_time=Dec%2010%2C%202018
for t,v in list(teachers.items()):
teachers[t]['num_courses'] = len(v['classes'])
teachers[t]['num_active_courses'] = sum( [x[2] for x in v['classes']] )
depts = [ dept_from_name(x[0]) for x in v['classes'] ]
teachers[t]['dept'] = most_common_item(depts)
#print(str(j), "/", str(i), " sections are published")
outfile.write(json.dumps(teachers))
"""
def teacherActivityLog(uid=1): ### Next: save results in a hash and return that....
global results, users, users_by_id
#get_users() # do this if you think 'teachers/users.json' is outdated.
load_users()
#for x in users_by_id.keys():
# if x < 20:
# print x
# print users_by_id[x]
teachers = csv.reader(open('teachers/current_semester.txt','r'), delimiter="\t")
for row in teachers:
print(row[0] + " is id: " + row[1])
uid = row[1]
print("Comes up as: " + str(users_by_id[int(uid)]))
info = users_by_id[int(uid)]
goo = info['login_id']
output_file = open('logs/users/byweek/'+ goo.lower() + '.csv', 'w')
# okay, actually, the first week here is the week before school IRL
start = isoweek.Week.withdate( datetime.date(2017,8,21))
end = isoweek.Week.thisweek()
byweek = []
i = 0
while(1):
results = []
start = start + 1
if start > end: break
myStart = start.day(0).isoformat() + 'T00:00-0700'
myEnd = start.day(6).isoformat() + 'T11:59:59-0700'
t = url + "/api/v1/users/" + str(uid) + "/page_views?start_time=" + myStart + '&end_time=' + myEnd + "&perpage=500"
print(t)
while(t):
print(".", end=' ')
t = fetch(t)
print("")
thisWeek = len(results)
print("Week # " + str(i) + "\t" + str(thisWeek))
byweek.append( "Week # " + str(i) + "\t" + str(thisWeek) )
output_file.write( start.isoformat() + "," + str(thisWeek) + "\n")
i += 1
for j in byweek:
print(j)
"""
"""
def summarize_student_teacher_role(u):
# u is a "group" from the groupby fxn
# term is sp18 now
t = 0
s = 0
for a in u:
if a=='TeacherEnrollment': t += 1
else: s += 1
    if NUM_ONLY:
        if t > s: return '1'
        return '0'
    else:
        if t > s: return 'teacher'
        return 'student'
"""
"""
def user_roles2():
# cross list users, classes enrolled, and their roles
global role_table, term_courses
role_table = enrollment_file()
user_table = users_file()
course_table = courses_file() # from canvas
term_table = term_file()
schedule = current_schedule() # from banner
# current semester
current = term_table[lambda d: d.course_section=='2018 Spring']
term_id = current['id'].values[0]
term_courses = course_table[lambda d: d.termid==term_id] # courses this semester
# add is_online flag (for courses listed in schedule as online-only)
term_courses['is_online'] = term_courses['code'].map( lambda x: course_is_online( get_crn_from_name(x) ) )
new_df = pd.DataFrame(columns=['type','oo','num'])
m = 0
data = []
for u in user_table.iterrows():
if m % 1000 == 0: print("on row " + str(m))
m += 1
data.append(categorize_user(u))
#if m > 1500: break
new_df = pd.DataFrame(data,columns=['i','type','onlineonly','numcls']).set_index('i')
print(new_df)
user_table = user_table.merge(new_df,left_index=True,right_index=True)
user_table.to_csv('canvas_data/users_online.csv')
"""
### IS THIS IN CANVAS_DATA.py?
""" Collate the raw logs into something more compact and useful. Version 1:
- # of accesses, user/day
- # of participations, user/day
-
- where day is the number of days into the semester. Classes shorter than 16 weeks should get a multiplier
-
- 2 initial goals:
a. data for statistics / clustering / regression / learning
b. data for visualization
"""
def req_to_db(fname_list):
fields = ','.join("id timestamp timestamp_year timestamp_month timestamp_day user_id course_id root_account_id course_account_id quiz_id discussion_id conversation_id assignment_id url user_agent http_method remote_ip interaction_micros web_application_controller web_applicaiton_action web_application_context_type web_application_context_id real_user_id session_id user_agent_id http_status http_version".split(" "))
sqlite_file = 'canvas_data/data.db'
conn = sqlite3.connect(sqlite_file)
c = conn.cursor()
# merge all requests into db
by_date_course = defaultdict( lambda: defaultdict(int) )
by_date_user = defaultdict( lambda: defaultdict(int) )
df_list = []
df_list_crs = []
users = defaultdict( lambda: defaultdict(int) )
i = 0
limit = 300
for fname in fname_list:
print((fname+"\n"))
for line in gzip.open('canvas_data/'+fname,'r'):
r = line.split('\t')
#tot = len(fields.split(','))
#i = 0
#for x in fields.split(','):
# print x + "\t" + r[i]
# i+= 1
qry = "insert into requests("+fields+") values (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
conn.execute(qry, r)
# New method for below:
# read collated data from sqlite
# collate from more logs
# write back....?
"""
date = datetime.datetime.strptime( r['timestamp'], "%Y-%m-%d %H:%M:%S.%f" )
if r['userid'] in users:
users[r['userid']]['freq'] += 1
if users[r['userid']]['lastseen'] < date:
users[r['userid']]['lastseen'] = date
else:
users[r['userid']] = {"id":r['userid'], "lastseen":date, "freq":1}
by_date_course[ r['day'] ][ r['courseid'] ] += 1
by_date_user[ r['day'] ][ r['userid'] ] += 1
#if r['userid'] in by_user: by_user[r['userid']] += 1
#else: by_user[r['userid']] = 1
#if r['courseid'] in by_course: by_course[r['courseid']] += 1
#else: by_course[r['courseid']] = 1
#mylog.write("by_user = " + str(by_user))
df_list.append(pd.DataFrame(data=by_date_user))
df_list_crs.append(pd.DataFrame(data=by_date_course))
"""
i += 1
if i > limit: break
conn.commit()
conn.close()
"""
Making columns:
table_data = [['a', 'b', 'c'], ['aaaaaaaaaa', 'b', 'c'], ['a', 'bbbbbbbbbb', 'c']]
for row in table_data:
print("{: >20} {: >20} {: >20}".format(*row))
Transpose a matrix:
rez = [[m[j][i] for j in range(len(m))] for i in range(len(m[0]))]
"""
"""
ilearn_by_id = {}
ilearn_by_name = {}
for x in ilearn_list:
ilearn_by_id[x[3]] = x
ilearn_by_name[x[0]] = x
for ml in open('cache/teacher_manual_name_lookup.csv','r').readlines():
parts = ml.strip().split(',')
try:
manual_list[parts[0]] = ilearn_by_id[parts[1]]
except Exception as e:
print "Teacher missing: " + parts[0]
il_names = [ x[0] for x in ilearn_list ]
il_byname = {}
for x in ilearn_list: il_byname[x[0]] = x
sched_list_missed = [x for x in sched_list]
#
# key is long name (with middle name) from schedule, value is tuple with everything
name_lookup = manual_list
matches = []
#print ilearn_list
num_in_sched = len(sched_list)
num_in_ilearn = len(ilearn_list)
#for i in range(min(num_in_sched,num_in_ilearn)):
# print "|"+sched_list[i] + "|\t\t|" + ilearn_list[i][0] + "|"
print("Sched names: %i, iLearn names: %i" % (num_in_sched,num_in_ilearn))
for s in sched_list:
for t in il_names:
if first_last(s) == t:
#print ' MATCHED ' + s + ' to ' + t
sched_list_missed.remove(s)
try:
name_lookup[s] = ilearn_by_name[ first_last(s) ]
except Exception as e:
print "Teacher missing (2): " + s
il_names.remove(first_last(s))
matches.append(s)
print "Matched: " + str(matches)
print "\nDidn't match: " + str(len(sched_list_missed)) + " schedule names."
print "\nFinal results: "
print name_lookup
nlf = codecs.open('cache/sched_to_ilearn_names.json','w','utf-8')
nlf.write(json.dumps(name_lookup,indent=2))
# STRING DISTANCE
#sim = find_most_similar(s,i_names)
#print ' CLOSEST MATCHES to ' + s + ' are: ' + str(sim)
#mm.write(s+',\n')
"""
#ilearn_list = sorted(list(set(map(
# lambda x: #(tfi[x]['name'],tfi[x]['email'],tfi[x]['dept'],str(tfi[x]['id']),tfi[x]['goo']),
# tfi.keys()))))
#i_names = [ x[0] for x in ilearn_list ]
#print json.dumps(i_names,indent=2)
#return
# how to filter a dict based on values
# filtered = {k: v for k, v in course_combos.items() if v['dept'] == 'LIB' or v['dept'] == 'CSIS' }
# more pandas
# gapminder['continent'].unique()
#for name,group in bycode:
# #print name
# print name, " ", group['type']
#onl = gg.agg( lambda x: has_online(x) )
#ttl = gg.agg( lambda x: len(x) )
#ttl = ttl.rename(columns={'type':'total_sections'})
#onl.join(gg.agg( lambda x: has_hybrid(x) ),how='outer')
#onl.join(gg.agg( lambda x: has_lecture(x) ), how='outer')
#onl['num_sections'] = 0
#onl['num_lec'] = 0
#onl['num_online'] = 0
#all = pd.merge([onl,hyb,lec])
#print onl
#total=len, f2f=lambda x: ) set(x)
#{ 'num_sections': "count",
# 'num_lec': lambda x: 5,
# 'num_online': lambda x: 5 } )
#print gg
"""
def has_online(series):
# if any items of the series have the string 'online', return 1
for i in series:
if i == 'online': return 1
return 0
def has_lecture(series):
    # if any items of the series have the string 'lecture', return 1
    for i in series:
        if i == 'lecture': return 1
    return 0
def has_hybrid(series):
# if any items of the series have the string 'online', return 1
for i in series:
if i == 'hybrid': return 1
return 0
"""
#### RIGHT HERE IS WHERE I THINK... MAYBE THIS ISN'T THE RIGHT APPROACH. I DON'T SEEM
#### TO BE ABLE TO QUERY THE FACT BASE. IS THAT TRUE? SHOULD I JUST BE USING TABLES?
#### CHANGING COURSE... USE THE RULES TO UPDATE A DATABASE/TABLE/DATAFRAME
#### OR SET OF DICTS.
# ultimately i want this to be more flexible, so i can categorize degrees as 'available evening' etc
#
# Simple data structure. In this function, a degree is
""" degree = { 'name': 'History AA',
'blocks': [ { 'original_title':'xxx', 'rulecode':'u3',
'courses': [ {'code':'math1a', 'units': '3.0', 'wasonline':False },
{'code':'math2a', 'units': '3.0', 'wasonline':False },
{'code':'math3a', 'units': '3.0', 'wasonline':False } ] },
{ 'original_title':'xyz', 'rulecode':'a',
'courses': [ {'code':'math5a', 'units': '3.0', 'wasonline':False },
{'code':'math6a', 'units': '3.0', 'wasonline':False },
{'code':'math7a', 'units': '3.0', 'wasonline':False } ] } ] }
"""
# Wrapper to get 2 schedules at once
def dl_sched():
global SEMESTER, semester_begin, filename, short_sem
SEMESTER = 'Fall 2019'
short_sem = 'fa19'
semester_begin = strptime('08/26', '%m/%d')
filename = 'fa19_sched.json'
txt = login()
codecs.open('output/'+filename,'w').write( json.dumps( to_section_list(txt) ) )
#stats()
#reg_nums()
#todo: these semesters
SEMESTER = 'Summer 2019'
short_sem = 'su19'
semester_begin = strptime('06/17', '%m/%d')
filename = 'su19_sched.json'
txt = login()
codecs.open('output/'+filename,'w').write( json.dumps( to_section_list(txt) ) )
#stats()
#reg_nums()
# Send a personalized email regarding ZTC
def send_z_email(fullname, firstname, addr, courses_list):
FULLNAME = fullname #"Sabrina Lawrence"
FNAME = firstname # "Sabrina"
to_email = addr # "slawrence@gavilan.edu"
courses = courses_list # ["CSIS45", "CSIS85"]
course_template = "<a href='%s'>%s</a> &nbsp; &nbsp;"
url_template = "https://docs.google.com/forms/d/e/1FAIpQLSfZLQp6wHFEdqsmpZ7jz2Y8HtKLo8XTAhrE2fyvTDOEgquBDQ/viewform?usp=pp_url&entry.783353363=%s&entry.1130271051=%s" # % (FULLNAME, COURSE1)
bare_link = "https://forms.gle/pwZJHdWSkyvmH4L19"
COURSELINKS = ''
PLAINCOURSES = ''
for C in courses:
ut = url_template % (FULLNAME, C)
COURSELINKS += course_template % (ut, C)
PLAINCOURSES += C + " "
text_version = open('cache/ztc_mail1.txt','r').read()
html_version = open('cache/ztc_mail1_h.txt','r').read()
# replace these: $FNAME $COURSELINKS $LINK
email = re.sub( r'\$FNAME', FNAME, text_version )
email = re.sub( r'\$COURSELINKS', PLAINCOURSES, email )
email = re.sub( r'\$LINK', bare_link, email )
email_h = re.sub( r'\$FNAME', FNAME, html_version )
email_h = re.sub( r'\$COURSELINKS', COURSELINKS, email_h )
print(email_h+"\n\n"+email)
from O365 import Account
credentials = ('phowell@gavilan.edu', 'xxx')
client_secret = 'xxx' # expires 10/28/2020
tenant_id = "4ad609c3-9156-4b89-9496-0c0600aeb0bb"
# application client id: 29859402-fa55-4646-b717-752d90c61cde
account = Account(credentials, auth_flow_type='credentials', tenant_id=tenant_id)
if account.authenticate():
print('Authenticated!')
#account = Account(credentials)
#if account.authenticate(scopes=['message_all']):
# print('Authenticated!')
m = account.new_message()
m.to.add(addr)
m.subject = 'Quick question about your course textbook'
m.body = "email_h"
m.send()
"""
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
msg = MIMEMultipart('alternative')
msg['Subject'] = "Quick question about your course textbook"
msg['From'] = "gavdisted@gmail.com"
msg['To'] = to_email
msg.attach(MIMEText(email, 'plain'))
msg.attach(MIMEText(email_h, 'html'))
#s = smtplib.SMTP('smtp.gmail.com', 587)
#s.starttls()
#s.login("gavdisted", "xxx")
s = smtplib.SMTP_SSL('smtp.office365.com',587)
s.ehlo()
s.starttls()
s.login('phowell@gavilan.edu', 'xxx')
#s.sendmail(msg['From'], msg['To'], msg.as_string())
s.sendmail(msg['From'], msg['To'], "Testing")
s.quit()"""
def getInactiveTeachersInTerm(t=23): # a list
global results
teachers = {}
emails = {}
outfile = codecs.open('canvas/inactive_teachers.txt','w', encoding='utf-8')
efile = codecs.open('canvas/inactive_teachers_emails.txt','w', encoding='utf-8')
#yn = raw_input('All courses? y=all n=only active ')
#all = 0
#if yn=='y': all = 1
if not t:
t = askForTerms()
else: t = [ t, ]
for term in t:
        r = url + '/api/v1/accounts/1/courses?enrollment_term_id=' + str(term) + '&per_page=100'
while(r): r = fetch(r)
all_courses = results #json.loads(results)
#print "All unpublished courses: "
i = 0
j = 0
for k in all_courses:
j += 1
if k['workflow_state'] != 'available':
i += 1
print(str(i), "\t", k['name'], "\t", k['workflow_state'])
results = []
t2 = url + '/api/v1/courses/' + str(k['id']) + '/search_users?enrollment_type=teacher'
while(t2): t2 = fetch(t2)
#print results
for r in results:
key = r['sortable_name'] + "\t" + str(r['id'])
#if not 'email' in r: pdb.set_trace()
emails[key] = str(r['sis_user_id'])
#print r
if key in teachers:
teachers[key].append(k['name'])
else:
teachers[key] = [ k['name'], ]
#print json.dumps(results, indent=4, sort_keys=True)
#a = raw_input()
print(str(i), "/", str(j), " sections are unpublished")
for t in list(emails.keys()):
efile.write(emails[t] + ", ")
for t in list(teachers.keys()):
outfile.write(t + "\t")
for c in teachers[t]:
outfile.write(c + ",")
outfile.write("\n")
#f.write(json.dumps(teachers, indent=4, sort_keys=True))
print("Output file is in ./teachers/current_semester.txt")
#print json.dumps(all_courses, indent=4, sort_keys=True)
"""for x in all_courses:
qry = '/api/v1/courses/' + str(course_id) + '/search_users?enrollment_type=teacher'
t = url + qry
while(t): t = fetch(t)
"""
#for t,v in teachers.items():
# outfile.write( "|".join( [ v['goo'], v['name'], v['email'], v['dept'], str(v['num_courses']), str(v['num_active_courses']) ] ) + "\n" )
#{"goo": "G00275722", "name": "Agaliotis, Paul", "num_courses": 1, "num_active_courses": 1, "id": 5092, "dept": "AMT", "classes": [["AMT120 POWERPLANT TECH FA18 10958", 5322, 1]], "email": "PAgaliotis@gavilan.edu"},
#for t in teachers.keys():
# outfile.write(t + "\t")
# for c in teachers[t]:
# outfile.write(c + ",")
# outfile.write("\n")
#f.write(json.dumps(teachers, indent=4, sort_keys=True))
#print "Output file is in ./teachers/current_semester.txt"
#print json.dumps(all_courses, indent=4, sort_keys=True)
"""for x in all_courses:
qry = '/api/v1/courses/' + str(course_id) + '/search_users?enrollment_type=teacher'
t = url + qry
while(t): t = fetch(t)
"""
def course_location(course):
    if len(course[0]) > 13:
        period = set([course_location_raw(course[0][13])])
    else:
        period = set()
if len(course) > 1:
period.add(course_location_raw(course[1][13]))
if len(course) > 2:
period.add(course_location_raw(course[2][13]))
if len(course) > 3:
period.add(course_location_raw(course[3][13]))
if len(course) > 4:
period.add(course_location_raw(course[4][13]))
if len(course) > 5:
period.add(course_location_raw(course[5][13]))
if 'TBA' in period:
period.remove('TBA')
period = list(period)
if len(period)==0:
return ''
if len(period)==1:
return period[0]
if len(period)==2 and 'Online' in period:
period.remove('Online')
return 'Hybrid at ' + period[0]
return '/'.join(period)
def course_time(course):
# is it morning, mid, or evening?
    period = set([raw_course_time(course[0][7])])
if len(course) > 1:
#time += ", " + course[1][7]
period.add(raw_course_time(course[1][7]))
if len(course) > 2:
#time += ", " + course[2][7]
period.add(raw_course_time(course[2][7]))
if len(course) > 3:
#time += ", " + course[3][7]
period.add(raw_course_time(course[3][7]))
if len(course) > 4:
#time += ", " + course[4][7]
period.add(raw_course_time(course[4][7]))
if len(course) > 5:
#time += ", " + course[5][7]
period.add(raw_course_time(course[5][7]))
#print raw_course_time(course[0][7]),
if 'TBA' in period:
period.remove('TBA')
period = list(period)
if len(period)==0:
return ''
if len(period)==1:
return period[0]
return '/'.join(period)
def course_teacher(course):
    t = set()
for c in course:
t.add(c[11])
return " / ".join(list(t))
def reg_nums():
courses = []
dates = []
sections = categorize()
today = todays_date_filename()
out = open(today+'.csv','w')
dates = {'loc':{}, 'time':{}, 'start':{}, 'teacher':{}}
i = 1
for f in os.listdir('.'):
m = re.search('reg_'+short_sem+'_(\d+)\.csv',f)
if m:
filein = open(f,'r').readlines()[1:]
d = m.group(1)
dates[d] = {}
for L in filein:
parts = L.split(',') # crn,code,sec,cmp,cred,name,days,time,cap,act,rem,teacher,date,loc
if not re.search('(\d+)',parts[0]): continue
if len(parts)<8: continue
if not parts[8]: continue
if float(parts[8])==0: continue
dates[d][parts[0] + " " + parts[1]] = (1.0* float(parts[9])) / float(parts[8])
if i == 1 and parts[0] in sections:
dates['loc'][parts[0] + " " + parts[1]] = course_location( sections[parts[0]] )
dates['time'][parts[0] + " " + parts[1]] = course_time(sections[parts[0]] )
dates['start'][parts[0] + " " + parts[1]] = course_start( sections[parts[0]] )
dates['teacher'][parts[0] + " " + parts[1]] = course_teacher( sections[parts[0]] )
#dates[d]['act'] = parts[9]
#dates[d]['nam'] = parts[5]
#dates[d]['onl'] = ''
#print parts
#if len(parts)>13 and parts[13]=='ONLINE': dates[d]['onl'] = 'online'
i += 1
"""for d in sorted(dates.keys()):
for c in d:
print d
print dates[d]['crs']"""
df = pd.DataFrame(dates)
df.to_csv(out)
# In the schedule, is this a class or a continuation of the class above?
def categorize():
# todo: must we open all these files?
dates = {}
files = sorted(os.listdir('.'))
files = list( filter( lambda x: re.search('reg(\d+)\.csv',x), files) )
files.reverse()
f = files[0]
filein = codecs.open(f,'r','utf-8').readlines()[1:]
sections = {}
this_section = []
for L in filein:
parts = L.strip().split(',') # crn,code,sec,cmp,cred,name,days,time,cap,act,rem,teacher,date,loc
parts = list( map( lambda x: clean_funny3(x), parts ) )
if not re.search('(\d+)',parts[0]): # This is a continuation
this_section.append(parts)
else: # this is a new section or the first line
if this_section:
sections[ this_section[0][0] ] = this_section
#print "Section: " + this_section[0][0] + " is: " + str(this_section) + "\n"
#print this_section[0][0] + "\t", course_start(this_section)
#print this_section[0][0] + "\t", course_time(this_section)
#print this_section[0][0] + "\t", course_location(this_section)
this_section = [ parts, ]
return sections
# Deprecated. Call perl.
def constructSchedule():
    term = input("Name of html file? (ex: sp18.html) ")
    os.chdir('make-web-sched')
    cmd = 'perl make.pl ' + term
    print("command: " + cmd)
    os.system(cmd)
"""
def fetch_dict(target,params={}):
# if there are more results, return the url for more fetching.
# else return false
#print target
global results_dict
r2 = requests.get(target, headers = header, params=params)
output = r2.text
if output.startswith('while('):
output = output[9:]
#print output
mycopy = results_dict.copy()
results_dict = {}
results_dict.update(json.loads(output))
results_dict.update(mycopy)
f.write(json.dumps(results_dict, indent=2))
#print "\n"
if ('link' in r2.headers):
links = r2.headers['link'].split(',')
for L in links:
ll = L.split(';')
link = ll[0].replace("<","")
link = link.replace(">","")
if re.search(r'next', ll[1]):
#print ll[1] + ":\t" + link
return link
return ""
"""
def get_schedule(term='201870', sem='fall'):
"""
sched_data = { 'term_in':term, 'sel_subj':'dummy', 'sel_day':'dummy',
'sel_schd':'dummy', 'sel_insm':'dummy', 'sel_camp':'dummy', 'sel_levl':'dummy', 'sel_sess':'dummy',
'sel_instr':'dummy', 'sel_ptrm':'dummy', 'sel_attr':'dummy', 'sel_subj':'%', 'sel_crse':'', 'sel_title':'',
'sel_schd':'%', 'sel_from_cred':'', 'sel_to_cred':'', 'sel_camp':'%', 'sel_ptrm':'%', 'sel_sess':'%',
'sel_attr':'%', 'begin_hh':'0', 'begin_mi':'0', 'begin_ap':'a', 'end_hh':'0', 'end_mi':'0', 'end_ap':'a' }
initial_headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, sdch, br',
'Accept-Language':'en-US,en;q=0.8',
'Connection':'keep-alive',
'Host':'ssb.gavilan.edu',
'Upgrade-Insecure-Requests':'1',
} #'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' }
headers = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Encoding':'gzip, deflate, br',
'Accept-Language':'en-US,en;q=0.8',
'Cache-Control':'max-age=0',
'Connection':'keep-alive',
'Content-Type':'application/x-www-form-urlencoded',
'Host':'ssb.gavilan.edu',
'Origin':'https://ssb.gavilan.edu',
'Referer':'https://ssb.gavilan.edu/prod/bwckgens.p_proc_term_date?p_calling_proc=bwckschd.p_disp_dyn_sched&p_term='+term,
'Upgrade-Insecure-Requests':'1',
} #'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36' }
initial_url = 'https://ssb.gavilan.edu/prod/bwckgens.p_proc_term_date?p_calling_proc=bwckschd.p_disp_dyn_sched&p_term=' + term
sesh = requests.Session()
#r1 = sesh.get(initial_url,headers=initial_headers)
#sesh.headers.update(headers)
url = 'https://ssb.gavilan.edu/prod/bwckschd.p_get_crse_unsec'
r1 = sesh.get(initial_url)
r = sesh.post(url, data=sched_data)
print r.headers
data = r.text
out = open('data/temp/'+term+'.html','w')
out.write(data)
out.close()"""
os.system('perl parse_schedule.pl data/temp/' + term + '.html' + ' ' + sem)
#####
#####
##### conf.py ?
str="""355 985 1296
354 730 1295
353 319 1290
352 985 1289
351 813 1285
350 281 1285
349 267 1279
348 981 1252
347 994 1252
346 26 1250
345 757 1288
344 368 1288
343 1 1286
259 703 1295
256 693 1293
255 660 1292
254 1 1291
250 482 1287
246 2 1284
245 333 1283
244 27 1282
243 703 1281
242 730 1281
241 482 1280
239 211 1278
238 794 1278
237 2 1277
236 297 1276
235 831 1276
233 482 1251"""
for L in str.split("\n"):
(id,host,session) = L.split("\t")
qry = "INSERT INTO conf_signups (user,session,timestamp) VALUES (%s,%s,'2022-08-08 17:20:00');" % (host,session)
print(qry)
## sched.py
import codecs
import requests, re, csv, json, funcy, sys
from content import upload_page
def dates(s):
#print(s)
m = re.match(r'(\d\d\d\d)\-(\d\d)\-(\d\d)',s)
if m:
s = m.group(2) + "/" + m.group(3)
#print(s)
return s
# "Course Code","Start Date","End Date",Term,Delivery,CRN,Status,"Course Name","Course Description","Units/Credit hours","Instructor Last Name","Instructor First Name",Campus/College,"Meeting Days and Times","Pass/No Pass available?","Class Capacity","Available Seats","Waitlist Capacity","Current Waitlist Length","Meeting Locations","Course Notes",ZTC
# ACCT103,2021-06-14,2021-07-23,"Summer 2021",Online,80386,Active,"General Office Accounting","This course is designed to prepare students for entry-level office accounting positions. Emphasis is on practical accounting applications. This course has the option of a letter grade or pass/no pass. ADVISORY: Eligible for Mathematics 430."," 3.00","Valenzuela Roque",Karla,"Gavilan College"," ",T," 30"," 18"," 20"," 0",,,
def parse_www_csv_sched():
old_keys = [ "CRN","Course Code","Units/Credit hours","Course Name","Meeting Days and Times","Class Capacity","Available Seats","Waitlist Capacity","Current Waitlist Length","Instructor Last Name","Start Date","Meeting Locations","ZTC","Delivery","Campus/College","Status","Course Description","Pass/No Pass available?","Course Notes" ]
# "Instructor First Name","End Date","Term",
new_keys = [ "crn", "code","cred", "name", "days", "cap", "rem", "wl_cap", "wl_act", "teacher", "date", "loc", "ztc", "type", "site","status","desc","pnp","note" ]
# "time","act","wl_rem", "partofday",
url = "https://gavilan.edu/_files/php/current_schedule.csv"
sched_txt = requests.get(url).text.splitlines()
sched = {"Fall 2021":[], "Spring 2022":[], "Winter 2022":[], "Summer 2021":[]}
shortsems = {"Fall 2021":"fa21", "Spring 2022":"sp22", "Winter 2022":"wi22", "Summer 2021":"su21","Summer 2022":"su22","Fall 2022":"fa22"}
for row in csv.DictReader(sched_txt):
d = dict(row)
for (old_key,new_key) in zip(old_keys,new_keys):
d[new_key] = d.pop(old_key).strip()
d['teacher'] = d.pop('Instructor First Name').strip() + " " + d['teacher']
d['date'] = dates(d['date']) + '-' + dates(d.pop('End Date').strip())
d['term'] = shortsems[d.pop('Term')]
if d['cred'] == ".00":
d['cred'] = "0"
if d['type'] == "Online":
d["loc"] = "ONLINE"
d["site"] = "Online"
d["type"] = "online"
#d.pop('Instructor First Name').strip() + " " + d['teacher']
#d["code"] = d.pop("Course Code")
#d["crn"] = d.pop("CRN")
sched[row['Term']].append(d) #print(row)
print( json.dumps(sched,indent=2))
for k,v in sched.items():
print("%s: %i" % (k,len(v)))
for v in sched["Fall 2021"]:
print("%s\t %s\t %s\t %s" % ( v['code'], v['days'], v['type'], v['loc'] ))
#print("%s\t %s\t %s\t %s" % ( v['Course Code'], v['Meeting Days and Times'], v['Delivery'], v['Meeting Locations'] ))
def parse_json_test_sched():
j2 = open('cache/classes_json.json','r').readlines()
for L in j2:
o3 = json.loads(L)
print(json.dumps(o3,indent=2))
if __name__ == "__main__":
print ('')
options = {
1: ['fetch and parse the csv on www.', parse_www_csv_sched],
2: ['parse the test json file.', parse_json_test_sched ],
}
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):
resp = int(sys.argv[1])
print("\n\nPerforming: %s\n\n" % options[resp][0])
else:
print ('')
for key in options:
print(str(key) + '.\t' + options[key][0])
print('')
resp = input('Choose: ')
# Call the function in the options dict
options[ int(resp)][1]()
def put_revised_pages():
course_num = '6862'
course_folder = '../course_temps/course_6862'
filein = codecs.open(course_folder+'/fullcourse.v2.html','r','utf-8')
my_titles = []
my_urls = []
my_bodys = []
started = 0
current_body = ""
for L in filein.readlines():
ma = re.search('^<h1>(.*)</h1>.*$',L)
if ma:
my_titles.append(ma.group(1))
my_urls.append(ma.group(2))
if started:
my_bodys.append(current_body)
current_body = ""
started = 1
else:
current_body += "\n" + L
my_bodys.append(current_body)
i = 0
for U in my_urls:
# and now upload it....lol
upload_page(course_num,U,my_bodys[i])
i += 1
# Upload pages. Local copy has a particular format.
# Appears to not be used
def put_course_pages():
course_num = '6862'
filein = codecs.open('cache/pages/course_'+str(course_num)+'.html','r','utf-8')
my_titles = []
my_urls = []
my_bodys = []
started = 0
current_body = ""
for L in filein.readlines():
ma = re.search('^###\s(.*)###\s(.*)$',L)
if ma:
my_titles.append(ma.group(1))
my_urls.append(ma.group(2))
if started:
my_bodys.append(current_body)
current_body = ""
started = 1
else:
current_body += "\n" + L
my_bodys.append(current_body)
i = 0
for U in my_urls:
# and now upload it....lol
upload_page(course_num,U,my_bodys[i])
i += 1
def freshdesk():
path = "C:\\Users\\peter\\Downloads\\freshdesk\\Solutions.xml"
soup = bs( codecs.open(path,'r','utf-8').read() ,features="lxml")
outpt = codecs.open('cache/faqs.txt','w')
out = ""
for a in soup.find_all('solution-article'):
print("TITLE\n"+a.find('title').get_text())
out += a.find('title').get_text()
"""for d in a.find_all('description'):
#print(d)
if d:
d = h.unescape(d.get_text())
e = stripper(d)
m = tomd.convert( e )
m = mycleaner(m)
print("\nDESCRIPTION\n"+m)"""
#print("\nWHAT IS THIS?\n" +
hh = a.find('desc-un-html').get_text()
d = h.unescape(hh)
e = stripper(d)
m = tomd.convert( e )
m = mycleaner(m)
print("\nDESCRIPTION\n"+m)
out += "\n\n" + m + "\n\n"
print("-----------\n\n")
outpt.write(out)
#### content.py
from pattern.web import plaintext, extension
from pattern.web import download
#from pattern import URL, MIMETYPE_IMAGE
from pattern.web import Crawler, DEPTH, FIFO, MIMETYPE_IMAGE, MIMETYPE_PDF, URL
class GavCrawl(Crawler):
def visit(self, link, source=None):
print('visited:', repr(link.url), 'from:', link.referrer)
print(' ', link.url.mimetype)
#txt = plaintext(source, keep={'h1':[], 'h2':[], 'h3':[], 'h4':[], 'td':[], 'strong':[], 'b':[], 'a':['href'], 'img':['src'], 'ul':[], 'ol':[], 'li':[], 'dd':[], 'dt':[], 'i':[]})
#codecs.open(save_folder + '/' + mycleaner(clean_title(link.url)) + '.txt','w','utf-8').write(tomd.convert(txt))
codecs.open(save_folder + '/' + clean_fn(link.url) + '.txt','w','utf-8').write(trafilatura.extract(source,include_links=True, deduplicate=True, include_images=True, include_formatting=True))
def fail(self, link):
print('failed:', repr(link.url))
if re.search(r'\.pdf$', link.url):
m = re.search(r'\/([^\/]+\.pdf)$', link.url)
if m:
save_file = m.group(1)
print("saving to ", save_folder + '/' + save_file)
pdf_response = requests.get(link.url)
with open(save_folder + '/' + save_file, 'wb') as f:
f.write(pdf_response.content)
text = extract_text(save_folder + '/' + save_file)
#print(text)
codecs.open(save_folder + '/' + save_file + '.txt','w','utf-8').write(text)
else:
print("no match for pdf url: ", link.url)
for ext in ['jpg','jpeg','gif','webp']:
if re.search(r'\.'+ext+'$', link.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', link.url)
if m:
save_file = m.group(1)
print("saving to ", save_folder + '/' + save_file)
pdf_response = requests.get(link.url)
with open(save_folder + '/' + save_file, 'wb') as f:
f.write(pdf_response.content)
else:
print('no match for '+ext+' url: ', link.url)
def crawl2():
#p = GavCrawl(links=['http://www.gavilan.edu/'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
#p = GavCrawl(links=['https://gavilan.edu/finaid/2022-23DirectLoanApplication1.pdf'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
p = GavCrawl(links=['https://gavilan.curriqunet.com/catalog/iq/1826'], domains=['gavilan.edu', 'gavilan.curriqunet.com','www.boarddocs.com'], delay=0.75)
while not p.done:
try:
p.crawl(method=DEPTH, cached=False, throttle=0.76)
except Exception as e:
print("Exception: ", e)
def samples():
crawler = Crawler(links=[], domains=[], delay=20.0, sort=FIFO)
    url = URL('http://www.clips.ua.ac.be/media/pattern_schema.gif')
print(url.mimetype in MIMETYPE_IMAGE)
#html = download('http://www.clips.ua.ac.be/', unicode=True)
s = URL('http://www.clips.ua.ac.be').download()
s = plaintext(s, keep={'h1':[], 'h2':[], 'strong':[], 'a':['href']})
# getting absolute urls
from pattern.web import URL, DOM, abs
url = URL('http://www.clips.ua.ac.be')
dom = DOM(url.download())
for link in dom('a'):
print(abs(link.attributes.get('href',''), base=url.redirect or url.string))
# get pdfs
from pattern.web import URL, PDF
url = URL('http://www.clips.ua.ac.be/sites/default/files/ctrs-002_0.pdf')
pdf = PDF(url.download())
print(pdf.string)