239 lines
7.6 KiB
Python
239 lines
7.6 KiB
Python
|
|
|
|
|
|
|
|
import re, csv
|
|
from collections import defaultdict
|
|
from bs4 import BeautifulSoup as bs
|
|
import pytz, datetime, dateutil, json
|
|
from datetime import timedelta
|
|
from dateutil import tz
|
|
|
|
import functools
|
|
|
|
from functools import reduce
|
|
|
|
def contains_key_value(lst, x, y):
    """
    Check whether a list contains a dictionary with a specific key-value pair.

    :param lst: List (possibly containing non-dict items) to search through.
    :param x: The key to look for.
    :param y: The value associated with the key.
    :return: True if some dictionary in the list maps x to y, otherwise False.
    """
    # any() short-circuits on the first match; the previous reduce()-based
    # scan always walked the whole list even after finding a hit.
    return any(isinstance(item, dict) and item.get(x) == y for item in lst)
|
|
|
|
def find_dict_with_key_value(lst, x, y):
    """
    Return the first dictionary in *lst* whose key *x* maps to *y*.

    :param lst: List (possibly containing non-dict items) to search through.
    :param x: The key to look for.
    :param y: The value associated with the key.
    :return: The first matching dictionary, or None when there is none.
    """
    for candidate in lst:
        if isinstance(candidate, dict) and candidate.get(x) == y:
            return candidate
    return None
|
|
|
|
|
|
def extract_key_values(lst, x):
    """
    Extract the values of the given key from a list of dictionaries.

    :param lst: List (possibly containing non-dict items) to search through.
    :param x: The key to look for.
    :return: A list of values for x, in order, from every dict that has it.
    """
    # A comprehension replaces the reduce()-with-concatenation version,
    # which rebuilt the accumulator list on every match (quadratic time).
    return [item[x] for item in lst if isinstance(item, dict) and x in item]
|
|
|
|
def stripper(s):
    """
    Parse HTML *s* and strip presentational/scripting attributes from every
    tag, returning the prettified markup.

    :param s: HTML document or fragment as a string.
    :return: Prettified HTML with the blacklisted attributes removed.
    """
    REMOVE_ATTRIBUTES = [
        'lang','language','onmouseover','onmouseout','script','style','font',
        'dir','face','size','color','style','class','width','height','hspace',
        'border','valign','align','background','bgcolor','text','link','vlink',
        'alink','cellpadding','cellspacing']

    soup = bs(s, features='lxml')
    for tag in soup.recursiveChildGenerator():
        try:
            # BUG FIX: dict.iteritems() is Python 2 only.  Under Python 3 it
            # raised AttributeError, which the except below swallowed, so the
            # attribute filtering never actually happened.  dict.items()
            # restores the intended behaviour.
            tag.attrs = {key: value for key, value in tag.attrs.items()
                         if key not in REMOVE_ATTRIBUTES}
        except AttributeError:
            # NavigableString nodes have no .attrs — skip them.
            pass
    return soup.prettify()
|
|
|
|
def mycleaner(s):
    """
    Lightly normalise an HTML-ish string: turn <br/> into newlines, drop
    <b>/</b> tags, collapse runs of spaces, blank out whitespace-only lines,
    and remove a single leading space.

    :param s: Input text.
    :return: Cleaned text.
    """
    substitutions = (
        (r'<br\s?\/>', '\n', 0),
        (r'<\/?b>', '', 0),
        (r' +', ' ', 0),
        (r'^[\s\t\r\n]+$', '', re.MULTILINE),
        (r'^ ', '', 0),
    )
    for pattern, replacement, flags in substitutions:
        s = re.sub(pattern, replacement, s, flags=flags)
    return s
|
|
|
|
|
|
|
|
def print_table(table):
|
|
longest_cols = [
|
|
(max([len(str(row[i])) for row in table]) + 3)
|
|
for i in range(len(table[0]))
|
|
]
|
|
row_format = "".join(["{:>" + str(longest_col) + "}" for longest_col in longest_cols])
|
|
for row in table:
|
|
print(row_format.format(*row))
|
|
|
|
def remove_nl(str):
    """Return *str* with trailing whitespace (including newlines) stripped."""
    # The parameter name shadows the builtin ``str``; kept unchanged so
    # keyword callers (remove_nl(str=...)) continue to work.
    result = str.rstrip()
    return result
|
|
|
|
def UnicodeDictReader(utf8_data, **kwargs):
    """
    Wrap csv.DictReader, yielding rows whose keys and values are text.

    BUG FIX: the original called str(key, 'utf-8') on values that are already
    str under Python 3, which raises TypeError (the two-argument form of
    str() requires bytes).  Decoding now happens only for bytes values, so
    the generator works on Python 3 while preserving the Python-2-era
    intent of decoding UTF-8 byte strings.

    :param utf8_data: Iterable of CSV lines (e.g. an open file object).
    :param kwargs: Passed through to csv.DictReader.
    :return: Generator of {str: str} dicts, one per CSV row.
    """
    for row in csv.DictReader(utf8_data, **kwargs):
        yield {
            (k.decode('utf-8') if isinstance(k, bytes) else k):
            (v.decode('utf-8') if isinstance(v, bytes) else v)
            for k, v in row.items()
        }
|
|
|
|
|
|
def minimal_string(s):
    """
    Normalise *s* for loose comparison: lowercase, replace every
    non-alphanumeric character with a space, collapse whitespace runs to a
    single space, and trim the ends.

    :param s: Input string.
    :return: Normalised string.
    """
    lowered = s.lower()
    alnum_only = re.sub(r'[^a-zA-Z0-9]', ' ', lowered)
    collapsed = re.sub(r'(\s+)', ' ', alnum_only)
    return collapsed.strip()
|
|
|
|
|
|
def to_file_friendly(st):
    """
    Turn *st* into a filesystem-friendly token: lowercase, with every run of
    characters outside [a-z0-9] replaced by a single underscore.

    :param st: Input string.
    :return: File-friendly string.
    """
    return re.sub(r"[^a-z0-9]+", "_", st.lower())
|
|
|
|
def clean_title(st):
    """
    Sanitise a title: swap characters outside [a-zA-Z0-9.-!] for spaces,
    then truncate to 50 characters (appending '...') when longer.

    :param st: Input title.
    :return: Cleaned (and possibly truncated) title.
    """
    sanitized = re.sub(r"[^a-zA-Z0-9\.\-\!]", " ", st)
    if sanitized:
        st = sanitized
    if len(st) > 50:
        return st[:50] + '...'
    return st
|
|
|
|
def int_or_zero(x):
    """
    Convert *x* to int, treating None as 0.

    :param x: Value convertible via int(), or None.
    :return: int(x), or 0 when x is None.
    """
    # `is None` replaces `== None` (PEP 8: identity test for singletons;
    # `== None` can be fooled by a custom __eq__).
    return 0 if x is None else int(x)
|
|
|
|
def float_or_zero(x):
    """
    Convert *x* to float, treating None as 0.

    :param x: Value convertible via float(), or None.
    :return: float(x), or 0 when x is None (the int 0 is kept, matching the
        original behaviour exactly).
    """
    # `is None` replaces `== None` (PEP 8: identity test for singletons).
    return 0 if x is None else float(x)
|
|
|
|
def match59(x):
    """
    Report whether record *x* belongs to context 7959.

    :param x: Mapping with x['links']['context'].
    :return: True when x['links']['context'] == 7959, else False.
    """
    return x['links']['context'] == 7959
|
|
|
|
|
|
def item_2(x):
    """Return the element at index 2 of *x* (usable as a sort key)."""
    return x[2]
|
|
|
|
def unix_time_millis(dt):
    """
    Milliseconds between aware datetime *dt* and this module's epoch
    reference.

    NOTE(review): the "epoch" here is datetime.fromtimestamp(0) — the Unix
    epoch expressed in the machine's local wall time — then tagged as
    US/Pacific.  That only yields true Unix milliseconds on a Pacific-time
    host; presumably intentional for this deployment — confirm.

    :param dt: Timezone-aware datetime.
    :return: Float milliseconds since the reference epoch.
    """
    pacific = pytz.timezone("US/Pacific")
    epoch_local = pacific.localize(datetime.datetime.fromtimestamp(0))
    return (dt - epoch_local).total_seconds() * 1000.0
|
|
|
|
# ENGL250 returns ENGL
def dept_from_name(n):
    """
    Extract the department prefix from a course name, e.g. 'ENGL250' -> 'ENGL'.

    :param n: Course name beginning with letters, then digits (an optional
        space between them is tolerated).
    :return: The leading alphabetic department code, or '' when the pattern
        does not match (a diagnostic is printed in that case).
    """
    # Raw string fixes the invalid-escape DeprecationWarning the plain
    # '...\\s...\\d...' literal produces on modern Python.
    m = re.search(r'^([a-zA-Z]+)\s?[\d\/]+', n)
    if m:
        return m.group(1)
    print("Couldn't find dept from: " + n)
    return ''
|
|
|
|
# ENGL250 returns 250
def num_from_name(n):
    """
    Extract the course number from a course name, e.g. 'ENGL250' -> '250'.

    :param n: Course name beginning with letters, then digits (optional
        space, '/'-joined numbers and one trailing capital letter allowed).
    :return: The course-number string, or '' when the pattern does not match
        (a diagnostic is printed in that case).
    """
    # Raw string fixes the invalid-escape DeprecationWarning the plain
    # string literal produces on modern Python.
    m = re.search(r'^([a-zA-Z]+)\s?([\d\/]+[A-Z]?)', n)
    if m:
        return m.group(2)
    print("Couldn't find num from: " + n)
    return ''
|
|
|
|
def most_common_item(li):
    """
    Return the most frequent element of *li*; ties break toward the larger
    element (highest count first, then highest value).

    :param li: Iterable of hashable, mutually comparable items.
    :return: The winning element.
    :raises IndexError: If li is empty.
    """
    tally = defaultdict(int)
    for element in li:
        tally[element] += 1
    ranked = sorted(tally.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    return ranked[0][0]
|
|
|
|
def srt_times(a, b):
    """
    cmp-style comparator for timestamp strings: positive when a is later
    than b, negative when earlier, 0 when equal to the second.

    Both strings are parsed and converted to the local timezone; the
    difference is returned via the exact seconds + days*86400 arithmetic of
    the original (microseconds are not included).

    :param a: First timestamp string.
    :param b: Second timestamp string.
    :return: Signed difference a - b in whole seconds.
    """
    local_zone = tz.tzlocal()
    first = dateutil.parser.parse(a).astimezone(local_zone)
    second = dateutil.parser.parse(b).astimezone(local_zone)
    delta = first - second
    return delta.seconds + delta.days * 24 * 3600
|
|
|
|
def how_long_ago(a):
    """
    Hours elapsed since timestamp string *a*.

    :param a: Timestamp string parseable by dateutil, or a falsy value.
    :return: Hours ago as a number; 9999 when a is falsy.
    """
    if not a:
        # Sentinel for "never" — callers treat missing timestamps as ancient.
        return 9999
    # Both datetimes are made naive so they can be subtracted directly; the
    # +8 below compensates for the stored times being UTC while the host
    # clock runs Pacific time.  NOTE(review): this breaks under DST (UTC-7)
    # and on non-Pacific hosts — confirm before reuse.
    # (Cleanup: removed the unused tz.tzlocal() local and the dead
    # commented-out astimezone() calls from the original.)
    d_now = datetime.datetime.now().replace(tzinfo=None)
    d_then = dateutil.parser.parse(a).replace(tzinfo=None)
    diff = d_now - d_then
    return (diff.seconds / 3600) + (diff.days * 24) + 8
|
|
|
|
def partition(times_list):
    """
    Break a list of timestamp strings (e.g. '2017-02-14T17:01:46Z') into
    usage sessions.

    Consecutive hits closer than 26 minutes apart belong to one session; a
    session is kept only when it has more than 2 hits.  Each kept session is
    recorded as [formatted_start, hit_count, minutes, formatted_hit_list].

    Side effect: writes a JSON timeline (one starting/ending-millis dict per
    hit) to the module-global file-like object ``dd``.

    :param times_list: List of parseable timestamp strings.
    :return: List of session records as described above.
    """
    minutes_till_new_session = 26
    global dd
    mm = ['x','Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
    start = ""
    last = ""
    hits = 0
    # The gap threshold now derives from the named constant above (the
    # original hard-coded 26 in both places).
    delta = timedelta(minutes=minutes_till_new_session)
    HERE = tz.tzlocal()
    sessions = []

    # BUG FIX: Python 3's sorted() takes no positional comparator, so the
    # original sorted(times_list, srt_times) raised TypeError.  Wrap the
    # cmp-style srt_times with functools.cmp_to_key instead.
    sorted_times_list = sorted(times_list, key=functools.cmp_to_key(srt_times))
    current_set = []
    timeline_times = []

    for T in sorted_times_list:
        dt = dateutil.parser.parse(T).astimezone(HERE)

        # Record a 1-minute timeline bar for every hit.
        timeline_st = unix_time_millis(dt)
        timeline_times.append({
            'starting_time': timeline_st,
            'ending_time': timeline_st + (1 * 60 * 1000),
        })

        month = mm[int(dt.strftime("%m"))]
        formatted = month + " " + dt.strftime("%d %H:%M")
        if not start:
            # First hit seen: open a new session.
            start = dt
            start_f = formatted
            last = dt
            current_set.append(formatted)
            hits = 1
        elif dt > last + delta:
            # Gap too long: close the current session (kept only when it has
            # more than 2 hits) and open a fresh one at this hit.
            minutes = ((last - start).seconds / 60) + 5
            if hits > 2:
                sessions.append([start_f, hits, minutes, current_set])
            start = dt
            start_f = formatted
            last = dt
            hits = 1
            current_set = [formatted]
        else:
            # Still inside the session window.
            last = dt
            current_set.append(formatted)
            hits += 1

    # Close out the final session, if any hits were seen.
    if last:
        minutes = ((last - start).seconds / 60) + 5
        if hits > 2:
            sessions.append([start_f, hits, minutes, current_set])

    dd.write(json.dumps(timeline_times))

    return sessions
|