"""Print upcoming events from Google Calendar (gcal) and the local
Outlook calendar (ocal) as Markdown-style tables.

Reconstructed from a newline-collapsed diff; heavyweight / platform-specific
dependencies (googleapiclient, win32com, canvas_secrets) are imported lazily
inside the functions that use them so the datetime helpers work everywhere.
"""

import datetime
from datetime import timedelta
from zoneinfo import ZoneInfo

# Google calendar IDs this script knows how to query, keyed by a short name.
calendars = {
    'peter_main': 'peter.howell@gmail.com',
    'aly_and_peter': '5qgh1nv9g5on3am4vres9i451c@group.calendar.google.com',
    'tlc': '4aq36obt0q5jjr5p82p244qs7c@group.calendar.google.com',
    'birthdays': '4q73r3ern2k9k83t0orq6iqaac@group.calendar.google.com',
}


def _pretty(dt):
    """Format a datetime as e.g. 'Thursday, July 4 | 12:30pm'.

    Shared formatting used by to_my_timezone()/in_my_timezone():
    strips the leading zero from the day and the hour, and lower-cases
    the AM/PM marker.
    """
    # %A weekday name, %B month name, %d day (zero-padded),
    # %I 12-hour clock (zero-padded), %M minute, %p AM/PM.
    formatted = dt.strftime("%A, %B %d | %I:%M%p")
    # %d and %I are zero-padded; drop the pad that follows a space.
    formatted = formatted.replace(" 0", " ")
    return formatted.replace("AM", "am").replace("PM", "pm")


def to_my_timezone(d, md_table_format=0):
    """Parse ISO-8601 string *d* and render it in Pacific time.

    A naive input (no UTC offset) is interpreted as system-local time by
    astimezone(), matching the original behavior.  *md_table_format* is
    kept for interface compatibility but is unused.
    """
    dt = datetime.datetime.fromisoformat(d)
    return _pretty(dt.astimezone(ZoneInfo("America/Los_Angeles")))


def in_my_timezone(d, md_table_format=0):
    """Parse ISO-8601 string *d* and render it with NO timezone conversion.

    Used for Outlook items, whose Start values are already local times.
    *md_table_format* is kept for interface compatibility but is unused.
    """
    return _pretty(datetime.datetime.fromisoformat(d))


def gcal():
    """Print up to 30 upcoming events from every configured Google calendar."""
    # Imported here so the datetime helpers above work without these deps.
    from googleapiclient.discovery import build
    from canvas_secrets import GOOGLE_API_KEY

    service = build('calendar', 'v3', developerKey=GOOGLE_API_KEY)
    n = 30

    for name, cal_id in calendars.items():
        # Current time in RFC3339 / 'Z' form, as the API requires.
        # (timezone-aware replacement for the deprecated utcnow()).
        now = datetime.datetime.now(datetime.timezone.utc) \
                               .isoformat().replace('+00:00', 'Z')
        print(f'Getting the upcoming {n} events')

        events_result = service.events().list(
            calendarId=cal_id,
            timeMin=now,
            maxResults=n,
            singleEvents=True,
            orderBy='startTime'
        ).execute()
        events = events_result.get('items', [])

        if not events:
            print('No upcoming events found.')
            # Was 'return', which silently skipped the remaining calendars.
            continue

        print("| Date | Time | Event | Lead |")
        print("|------|------|-------|------|")
        for event in events:
            # All-day events carry 'date' instead of 'dateTime'.
            start = event['start'].get('dateTime', event['start'].get('date'))
            # to_my_timezone() embeds a '|' between date and time, so it
            # fills the first two columns; one trailing empty cell = Lead.
            # (Original printed two empty cells: 5 columns vs a 4-col header.)
            print(f"| {to_my_timezone(start, 1)} | {event.get('summary', 'No Title')} | |")


def ocal():
    """Print the next n days of appointments from the local Outlook calendar."""
    # win32com is Windows-only; import lazily so the rest of the module
    # remains importable on other platforms.
    import win32com.client

    outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")

    root_folder = outlook.Folders.Item(1)
    print(f"Root folder: {root_folder.Name}")

    # 9 == olFolderCalendar, the default Calendar folder.
    calendar_folder = outlook.GetDefaultFolder(9)
    calendar_items = calendar_folder.Items
    print("Total items in Calendar:", calendar_items.Count)

    # Time window: now through n days out.  (Comment in the original said
    # "next 7 days" but n was 14.)
    n = 14
    now = datetime.datetime.now()
    end = now + timedelta(days=n)

    # Outlook's Restrict() filter wants dates as "mm/dd/yyyy hh:mm" strings.
    filter_start = now.strftime("%m/%d/%Y %H:%M")
    filter_end = end.strftime("%m/%d/%Y %H:%M")
    restriction = f"[Start] >= '{filter_start}' AND [End] <= '{filter_end}'"

    # Expand recurring appointments and sort before restricting,
    # preserving the original call order.
    calendar_items.IncludeRecurrences = True
    calendar_items.Sort("[Start]")

    print(f"Calendar items in next {n} days:")
    for item in calendar_items.Restrict(restriction):
        # item.Start is a COM datetime; its str() form is ISO-parsable.
        start = in_my_timezone(str(item.Start), 1)
        print(f"{start} - {item.Subject}")


if __name__ == '__main__':
    ocal()
\n\n' DBG = 1 +items = [] + def d(s): global DBG if DBG: print(s) @@ -88,7 +89,7 @@ def test_forums(id=0): except: print("Course folder exists.") - index.extend( extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose) ) + index.extend( extract_forums(id, course_folder, item_id_to_index, verbose) ) print(json.dumps(index,indent=2)) def write_message(fd, view, participants): @@ -98,10 +99,13 @@ def write_message(fd, view, participants): write_message(fd, r, participants) fd.write("\n") -def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0): +def extract_forums(id, course_folder, item_id_to_index, verbose=0): ### ### FORUMS ### + + global items + index = [] forum_f = course_folder + '/forums' headered = 0 @@ -136,7 +140,7 @@ def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0 # write to running log of content in order of module if p['id'] in item_id_to_index: - items_inorder[ item_id_to_index[ p['id'] ] ] = f"

{title}

\n\n{message}\n\n{pagebreak}" + items[ item_id_to_index[ p['id'] ] ] = f"

{title}

\n\n{message}\n\n{pagebreak}" else: print(' This forum didnt seem to be in the modules list.') except Exception as e: @@ -151,14 +155,30 @@ def extract_forums(id, course_folder, items_inorder, item_id_to_index, verbose=0 - +# +# +# +# +# +# todo: include front page. +# todo: clean html +# todo: toc +# +# # Download everything interesting in a course to a local folder # Build a master file with the entire class content -def accessible_check(id=""): +def course_download(id=""): + global items + if not id: id = input("ID of course to check? ") - verbose = 1 - PAGES_ONLY = 1 + # temp hard code + #id = "21284" + + verbose = 0 + PAGES_ONLY = 0 + + videos_log = codecs.open('cache/accessible_check_log.txt','w','utf-8') save_file_types = ['application/pdf','application/docx','image/jpg','image/png','image/gif','image/webp','application/vnd.openxmlformats-officedocument.wordprocessingml.document'] @@ -167,14 +187,12 @@ def accessible_check(id=""): # reverse lookup into items array item_id_to_index = {} - # is it used? - items_inorder = ["" + courseinfo['name'] + "\n\n" + pagebreak,] - running_index = 1 modules = fetch('/api/v1/courses/' + str(id) + '/modules',verbose) # headers / module names - items = [] + items = [f"

{courseinfo['name']}

\n{pagebreak}",] + running_index = 1 for x in range(9000): items.append(0) video_link_list = [] @@ -192,7 +210,7 @@ def accessible_check(id=""): if I['type'] == 'SubHeader': #print('subheader: ' + str(I)) - items[running_index] = '

%s

\n' % str(json.dumps(I,indent=2)) + items[running_index] = f"

{I['title']}

\n" if I['type'] == 'Page': item_id_to_index[ I['page_url'] ] = running_index @@ -303,23 +321,22 @@ def accessible_check(id=""): else: t2 = fetch('/api/v1/courses/' + str(id) + '/pages/'+p['url'], verbose) if t2 and 'body' in t2 and t2['body']: - bb = bs(t2['body'],features="lxml") - a_links = bb.find_all('a') + soup_infolder = bs(t2['body'],features="lxml") + soup_in_main = bs(t2['body'],features="lxml") + a_links = soup_infolder.find_all('a') for A in a_links: href = A.get('href') if href and re.search( r'youtu',href): video_link_list.append( (A.get('href'), A.text, 'pages/'+easier_filename + ".html") ) - - page_images = bb.find_all('img') + # Images + page_images = soup_infolder.find_all('img') + page_image_paths = {} for I in page_images: src = I.get('src') if src: d(' - %s' % src) - #if re.search(r'eis-prod', src) or re.search(r'gavilan\.ins', src): - # d(' * skipping file behind passwords') - #else: try: r = requests.get(src,headers=header, stream=True) mytype = r.headers['content-type'] @@ -327,16 +344,74 @@ def accessible_check(id=""): r_parts = mytype.split("/") ending = r_parts[-1] - with open(pages_f + '/' + str(image_count) + "." 
+ ending, 'wb') as fd: + if ending=='jpeg': ending = "jpg" + + img_full_path = f"{pages_f}/{str(image_count)}.{ending}" + local_src = f"{str(image_count)}.{ending}" + page_image_paths[src] = f"pages/{local_src}" + I['src'] = local_src + + with open(img_full_path, 'wb') as fd: for chunk in r.iter_content(chunk_size=128): fd.write(chunk) image_count += 1 except Exception as e: d( ' * Error downloading page image, %s' % str(e) ) - + + # Repeat for version for main file + page_main_images = soup_in_main.find_all('img') + for I in page_main_images: + src = I.get('src') + if src: + I['src'] = page_image_paths[src] + + + # STUDIO VIDEOS + # Regex pattern to match "custom_arc_media_id%3D" and capture everything + # until the next '&' or end of string + pattern = r"custom_arc_media_id%3D([^&]+)" + found_ids = [] + + replacement_tag = '''''' + + # Iterate over all