much better scraping and indexing

This commit is contained in:
Coding with Peter 2023-04-19 20:06:55 -07:00
parent b0e5e278a1
commit c5d9f6f288
4 changed files with 796 additions and 641 deletions

View File

@ -883,7 +883,9 @@ def crawl():
avoid = ['ezproxy','community\.gavilan\.edu','archive\/tag','archive\/category', 'my\.gavilan\.edu', 'augusoft',
'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule', ]
'eis-prod', 'ilearn\.gavilan', 'mailto', 'cgi-bin', 'edu\/old\/schedule',
'admit\/search\.php', 'GavilanTrusteeAreaMaps2022\.pdf', 'schedule\/2019', 'schedule\/2020', 'schedule\/2021',
'schedule\/2022', 'schedule\/previous', ]
class MySpider(scrapy.Spider):
name = 'myspider'
@ -903,9 +905,10 @@ def crawl():
def parse(self, response):
print('visited:', repr(response.url), 'status:', response.status)
done = 0
if re.search(r'\.pdf$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
m = re.search(r'\/([^\/]+\.pdf)$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
@ -913,8 +916,9 @@ def crawl():
f.write(pdf_response.content)
text = extract_text(save_folder + '/' + clean_fn(response.url))
codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(text)
done = 1
for ext in ['doc','docx','ppt','pptx']:
for ext in ['doc','docx','ppt','pptx','rtf','xls','xlsx']:
if re.search(r'\.'+ext+'$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
if m:
@ -923,20 +927,28 @@ def crawl():
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
#text = extract_text(save_folder + '/' + clean_fn(response.url) + '.txt')
output = pypandoc.convert_file(save_folder + '/' + clean_fn(response.url), 'html', extra_args=['--extract-media=%s' % hash ])
txt_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
pandoc_infile = save_folder + '/' + clean_fn(response.url)
pandoc_outfile = save_folder + '/' + clean_fn(response.url) + '.html'
print("pandoc in file: %s" % pandoc_infile)
print("pandoc outfile: %s" % pandoc_outfile)
pypandoc.convert_file(pandoc_infile, 'html', outputfile=pandoc_outfile, extra_args=['--from=%s' % ext, '--extract-media=%s' % save_folder + '/img' ])
pandoc_output = codecs.open(pandoc_outfile,'r','utf-8').read()
txt_output = trafilatura.extract(pandoc_output,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
if txt_output:
codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8').write(txt_output)
done = 1
for ext in ['jpg','jpeg','gif','webp']:
for ext in ['jpg','jpeg','gif','webp','png','svg','bmp','tiff','tif','ico']:
if re.search(r'\.'+ext+'$', response.url):
m = re.search(r'\/([^\/]+\.'+ext+')$', response.url)
if m:
print("saving to ", save_folder + '/' + clean_fn(response.url))
print("saving to ", save_folder + '/img/' + clean_fn(response.url))
pdf_response = requests.get(response.url)
with open(save_folder + '/' + clean_fn(response.url), 'wb') as f:
with open(save_folder + '/img/' + clean_fn(response.url), 'wb') as f:
f.write(pdf_response.content)
done = 1
if not done:
f_out = codecs.open(save_folder + '/' + clean_fn(response.url) + '.txt','w','utf-8')
this_output = trafilatura.extract(response.text,include_links=True, deduplicate=True, include_images=True, include_formatting=True)
@ -946,7 +958,6 @@ def crawl():
links = response.css('a::attr(href)').getall()
# Follow each link and parse its contents
for link in links:
go = 1
full_link = response.urljoin(link)
@ -1065,6 +1076,137 @@ def search_embeddings():
print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')
from whoosh import fields, columns
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID, STORED, NUMERIC
from whoosh.qparser import QueryParser
from whoosh.analysis import StemmingAnalyzer
def priority_from_url(url):
    """Compute the numeric sort priority for a cached crawl filename.

    ``url`` is a flattened URL such as
    ``https++www.gavilan.edu+news+Newsletters.php.txt`` ('+' stands in for
    '/').  The value is stored in the search index and used by
    ``search_index`` as its ``sortedby`` key.

    Names that do not look like a gavilan.edu page get a flat 50.  Otherwise
    the score starts at 1, grows for certain site sections, and is finally
    multiplied by the number of path segments.
    """
    priority = 1
    # url is like this: https++www.gavilan.edu+news+Newsletters.php.txt
    m = re.search(r'gavilan\.edu\+(.*)\.\w\w\w\w?$', url)
    if not m:
        return priority * 50

    parts = m.group(1).split('+')
    section = parts[0]
    if section in ('accreditation', 'curriculum', 'senate', 'research',
                   'old', 'committee', 'board', 'styleguide'):
        priority += 20
    if section in ('news', 'IT', 'HOM', 'administration'):
        priority += 10
    # Slice comparison instead of parts[1]: a single-segment path such as
    # "admit" used to raise IndexError here.
    if parts[:2] == ['admit', 'schedule']:
        priority += 10
    if 'accreditation' in parts:
        priority += 50
    if re.search(r'hhh\.gavilan\.edu', url):
        priority += 100
    return priority * len(parts)
def test_priority():
    """Run priority_from_url over every filename in cache/crawl.

    The results are discarded; this is a smoke test that the scorer does
    not raise on any cached filename.
    """
    for cached_name in os.listdir('cache/crawl'):
        priority_from_url(cached_name)
def displayfile(f, aslist=0):
    """Return the cleaned text of the cached crawl file ``cache/crawl/<f>``.

    Each line is stripped; empty lines and lines consisting only of ``|``
    are dropped.

    f      -- filename inside cache/crawl
    aslist -- truthy: return the list of lines; falsy (default): return
              the lines joined with newlines.
    """
    # Context manager so the handle is closed even if decoding fails.
    with codecs.open('cache/crawl/' + f, 'r', 'utf-8') as infile:
        stripped = [raw.strip() for raw in infile.readlines()]
    lines = [line for line in stripped if line and line != '|']
    if aslist:
        return lines
    return "\n".join(lines)
def any_match(line, words):
    """Return True if any of ``words`` occurs in ``line`` (case-insensitive).

    Words are matched literally.  They are escaped before being handed to
    ``re`` so that user input containing regex metacharacters (``c++``,
    ``(``, ``*`` ...) no longer raises ``re.error`` or matches unintended
    text.
    """
    return any(re.search(re.escape(w), line, re.IGNORECASE) for w in words)
def find_match_line(filename, query):
    """Return the lines of a cached crawl file that mention any query word.

    filename -- name of a file inside cache/crawl
    query    -- whitespace-separated words; a line is kept when
                ``any_match`` finds at least one of them in it.

    The kept lines are joined with newlines.
    """
    # split() rather than split(" "): repeated or trailing spaces used to
    # yield empty words, and an empty pattern matches every line.
    q_words = query.split()
    # displayfile applies the same strip / drop-empty / drop-'|' cleanup this
    # function used to duplicate, and closes the file handle.
    return "\n".join(L for L in displayfile(filename, 1) if any_match(L, q_words))
def search_index():
    """Interactive search prompt over the Whoosh index in cache/searchindex.

    Reads queries from stdin until the user enters 'q'.  Each query is
    parsed against the "content" field, results are sorted by the stored
    "priority" column, and every hit is printed with its matching lines
    from the cached file.
    """
    schema = Schema(url=STORED, title=TEXT(stored=True), content=TEXT, priority=fields.COLUMN(columns.NumericColumn("i")))
    ix = open_dir("cache/searchindex")

    # Handy index diagnostics, kept for reference:
    #with ix.reader() as reader:
    #print(reader.doc_count()) # number of documents in the index
    #print(reader.doc_frequency("content", "example")) # number of documents that contain the term "example" in the "content" field
    #print(reader.field_length("content")) # total number of terms in the "content" field
    #print(reader.term_info("content", "example")) # information about the term "example" in the "content" field
    #print(reader.dump()) # overview of the entire index

    while True:
        s = input("search or 'q' to quit: ")
        if s == 'q':
            return

        with ix.searcher() as searcher:
            # Parse the user's query against the content field
            query = QueryParser("content", schema=schema).parse(s)
            print(query)

            # Hits must be consumed while the searcher is still open
            hits = searcher.search(query, sortedby="priority")
            for rank, hit in enumerate(hits, start=1):
                print(rank, hit) # hit["url"], hit["content"])
                print(find_match_line(hit['url'], s))
                print()
def create_search_index():
    """(Re)build the Whoosh full-text index in cache/searchindex.

    Every file in cache/crawl whose name looks like a flattened
    www.gavilan.edu URL ending in ``.<ext>.txt`` is indexed with:
      url      -- the cache filename (stored)
      title    -- the path portion captured from the filename
      content  -- cleaned text from displayfile()
      priority -- sort key from priority_from_url()
    """
    schema = Schema(url=STORED, title=TEXT(stored=True), content=TEXT, priority=fields.COLUMN(columns.NumericColumn("i")))

    # create_in() needs the directory to exist already.
    os.makedirs("cache/searchindex", exist_ok=True)
    ix = create_in("cache/searchindex", schema)

    # To append to an existing index instead:
    #ix = open_dir("cache/searchindex")

    # As a context manager the writer commits on success and cancels if an
    # exception escapes, so a failed run does not leave the index locked.
    with ix.writer() as writer:
        for f in sorted(os.listdir('cache/crawl')):
            m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$', f)
            if m:
                print(f)
                writer.add_document(url=f, title=m.group(1), content=displayfile(f), priority=priority_from_url(f))
def create_embeddings():
model = SentenceTransformer('all-MiniLM-L6-v2')
files = os.listdir('cache/crawl')
@ -1074,9 +1216,7 @@ def create_embeddings():
for f in files:
m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
if m:
lines = codecs.open('cache/crawl/' + f,'r','utf-8').readlines()
lines = [L.strip() for L in lines]
lines = [L for L in lines if L]
lines = displayfile(f,1)
embeddings = model.encode(lines)
print("\n-----", f)
@ -1104,7 +1244,10 @@ if __name__ == "__main__":
9: ['clean text index', txt_clean_index],
10: ['make web dir struct', manual_index],
11: ['create search embeddings', create_embeddings],
12: ['do a search', search_embeddings],
12: ['create search index', create_search_index],
13: ['do an index search', search_index],
14: ['do a vector search', search_embeddings],
15: ['test priority', test_priority],
}
if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):

View File

@ -301,6 +301,614 @@ def serve():
"""
### interactive.py
"""class HelloWorldExample(object):
def make_teacher_rel(self, tchr, clss):
with self._driver.session() as tx:
tx.run("MERGE (tchr:Teacher {name: $tchr}) MERGE (tchr)-[:TEACHES]->(clss:Class {name: $clss})", \
tchr=tchr, clss=clss)
def __init__(self, uri, user, password):
self._driver = GraphDatabase.driver(uri, auth=(user, password))
def close(self):
self._driver.close()
def print_greeting(self, message):
with self._driver.session() as session:
greeting = session.write_transaction(self._create_and_return_greeting, message)
print(greeting)
@staticmethod
def _create_and_return_greeting(tx, message):
result = tx.run("CREATE (a:Greeting) "
"SET a.message = $message "
"RETURN a.message + ', from node ' + id(a)", message=message)
return result.single()[0]
"""
def make_teacher_rel(g, tchr, clss):
    """Run a Cypher MERGE on ``g`` linking Teacher ``tchr`` to Class ``clss``.

    g    -- object exposing ``run(query, **params)``
    tchr -- value bound to the ``$tchr`` query parameter
    clss -- value bound to the ``$clss`` query parameter
    """
    cypher = "MERGE (tchr:Teacher {name: $tchr}) MERGE (tchr)-[:TEACHES]->(clss:Class {name: $clss})"
    g.run(cypher, tchr=tchr, clss=clss)
def testgraph():
    """Load the spring-2020 schedule JSON into a local Neo4j graph.

    For every schedule entry this creates (once per distinct value) a
    Teacher, a "Course section" and a Location node, then a Section node
    keyed by CRN with TEACHES / CLASS OF / LOCATED AT relationships.
    """
    # NOTE(review): connection URL and credentials are hard-coded.
    gg = Graph("bolt://localhost:7687", auth=("neo4j", "asdf"))

    #gg.run("DROP CONSTRAINT ON (tchr:Teacher) ASSERT tchr.name IS UNIQUE")
    #gg.run("DROP CONSTRAINT ON (clss:Class) ASSERT clss.name IS UNIQUE")
    #gg.run("CREATE INDEX ON :Teacher(name)")
    #gg.run("CREATE INDEX ON :Class(name)")

    # Context manager so the schedule file is closed after parsing.
    with open('output/semesters/2020spring/sp20_sched.json', 'r') as fh:
        stuff = json.load(fh)

    # Caches of already-created nodes, keyed by teacher / course code / location
    tch = {}
    crs = {}
    loc = {}

    for c in stuff:
        if c['teacher'] not in tch:
            tch[c['teacher']] = Node("Teacher", name=c['teacher'])
            gg.create(tch[c['teacher']])
        if c['code'] not in crs:
            crs[c['code']] = Node("Course section", name=c['name'], code=c['code'])
            gg.create(crs[c['code']])
        if c['loc'] not in loc:
            loc[c['loc']] = Node("Location", loc=c['loc'])
            gg.create(loc[c['loc']])
        sect = Node("Section", crn=int(c['crn']))
        gg.create(Relationship(tch[c['teacher']], "TEACHES", sect))
        gg.create(Relationship(sect, "CLASS OF", crs[c['code']]))
        gg.create(Relationship(sect, "LOCATED AT", loc[c['loc']]))
"""
for c in stuff:
print(c['crn'])
q = "CREATE (section:Section { Name: "+c['name']+", Code: "+c['code']+", Crn: "+c['crn']+", Teacher: "+c['teacher']+" })"
q = 'CREATE (section:Section { Name: "%s", Code: "%s", Crn: "%s", Teacher: "%s" })' % \
(c['name'], c['code'], c['crn'], c['teacher'])
gg.run(q)
"""
#gg = HelloWorldExample("bolt://localhost:7687", "neo4j", "asdf")
#gg.print_greeting("hi there world")
"""
make_teacher_rel(gg, "Peter Howell","CSIS 42")
make_teacher_rel(gg, "Alex Stoykov","CSIS 42")
make_teacher_rel(gg, "Sabrina Lawrence","CSIS 85")
make_teacher_rel(gg, "Peter Howell","CSIS 85")
"""
screen = 0
def Memoize( func):
    """
    Memoize decorator: remember func's result per tuple of positional args.
    """
    results = {}

    @wraps(func)
    def memoized(*call_args):
        try:
            return results[call_args]
        except KeyError:
            # First call with these arguments: compute and remember
            value = results[call_args] = func(*call_args)
            return value

    return memoized
class MyRepl:
    """Curses line-editor REPL with word completion and command history.

    Usage (see ``repl_staff``): create an instance, install a completion
    table with ``set_my_dict``, call ``startup(outfile)`` to initialise
    curses, then ``inputloop()``.  All drawing goes through the module-level
    ``screen`` window created by ``startup``.

    State kept on the instance:
      g    -- dict of UI state: height, width, prefix, prev (last suggestion
              list), tab_cycle, hist, hist_index
      buf  -- dict mapping screen row -> (text, color_pair) for redraws
      data -- append-mode file that receives every entered command
    """

    # Default completion table: key = completable text, value = help text
    # shown when the key is the only suggestion.
    description = {
        "switch ": "Switch stream. You can use either 'switch public' or 'switch mine'",
        "home " : "Show your timeline. 'home 7' will show 7 tweet.",
        "harry " : "a guys name.",
        "homo " : "means the same.",
        "view " : "'view @mdo' will show @mdo's home.",
        "h " : "Show help.",
        "t " : "'t opps' will tweet 'opps' immediately.",
        "s " : "'s #AKB48' will search for '#AKB48' and return 5 newest tweets."
    }

    def startup(self, outfile):
        """Initialise curses, colours and per-session state.

        outfile -- path of the log file commands are appended to
                   (opened by setup_command).
        """
        global screen  # make it self
        self.g = {}
        self.buf = {}
        screen = None
        # Key codes recognised by inputloop
        self.enter_ary = [curses.KEY_ENTER, 10]
        self.delete_ary = [curses.KEY_BACKSPACE, curses.KEY_DC, 8, 127, 263]
        self.tab_ary = [9]
        self.up_ary = [curses.KEY_UP]
        self.down_ary = [curses.KEY_DOWN]

        # Init curses screen
        screen = curses.initscr()
        screen.keypad(1)
        curses.noecho()
        try:
            curses.start_color()
            curses.use_default_colors()
            for i in range(0, curses.COLORS):
                curses.init_pair(i + 1, i, -1)
        except curses.error:
            # Terminal without colour support: carry on monochrome
            pass
        curses.cbreak()
        self.g['height'], self.g['width'] = screen.getmaxyx()
        #print("Width: %i" % self.g['width'])

        # Init color functions.  These previously called a bare
        # curses_print_word, which is a NameError inside a method.
        self.white = lambda x: self.curses_print_word(x, 7)  #0)
        self.grey = lambda x: self.curses_print_word(x, 3)  #3)1)
        self.red = lambda x: self.curses_print_word(x, 7)  #2)
        self.green = lambda x: self.curses_print_word(x, 3)  #3)
        self.yellow = lambda x: self.curses_print_word(x, 7)  #4)
        self.blue = lambda x: self.curses_print_word(x, 3)
        self.magenta = lambda x: self.curses_print_word(x, 7)  #6)
        self.cyan = lambda x: self.curses_print_word(x, 7)  #7)
        self.colors_shuffle = [self.grey, self.red, self.green, self.yellow,
                               self.blue, self.magenta, self.cyan]
        self.cyc = itertools.cycle(self.colors_shuffle[1:])
        self.index_cyc = itertools.cycle(range(1, 8))
        self.setup_command(outfile)

    def set_my_dict(self, d):
        """Replace the completion table (key -> help text) for this instance."""
        self.description = d

    def cycle_color(self, s):
        """
        Return the colour function assigned to ``s``; the first request for
        a new ``s`` takes the next entry from the colour cycle.

        Results are remembered per instance (a plain dict rather than a
        module-level memoize cache, which would keep every instance alive).
        """
        cache = self.__dict__.setdefault('_color_cache', {})
        if s not in cache:
            cache[s] = next(self.cyc)
        return cache[s]

    def ascii_art(self, text):
        """
        Draw ``text`` as figlet ascii art, one colour per output line
        """
        fi = figlet_format(text, font='doom')
        for i in fi.split('\n'):
            self.curses_print_line(i, next(self.index_cyc))

    def close_window(self):
        """
        Close screen and restore normal terminal modes
        """
        global screen
        screen.keypad(0)
        curses.nocbreak()
        curses.echo()
        curses.endwin()

    def suggest(self, word):
        """
        Return the completion keys having a space-separated word that starts
        with ``word`` (case-insensitive).  Empty ``word`` gives [].

        Each key is listed once, even if several of its words match
        (previously e.g. "Peter Parker" appeared twice for "p").
        """
        if not word:
            return []
        word = word.lower()
        return [
            candidate for candidate in self.description
            if any(part.startswith(word) for part in candidate.lower().split(" "))
        ]

    def curses_print_word(self, word, color_pair_code):
        """
        Print a word at the cursor using the given colour pair
        """
        global screen
        word = word.encode('utf8')
        screen.addstr(word, curses.color_pair(color_pair_code))

    def curses_print_line(self, line, color_pair_code):
        """
        Print a line, scroll down if need; the line is recorded in buf
        """
        global screen
        line = line.encode('utf8')
        y, x = screen.getyx()
        if y - self.g['height'] == -3:
            self.scroll_down(2, y, x)
            screen.addstr(y, 0, line, curses.color_pair(color_pair_code))
            self.buf[y] = line, color_pair_code
        elif y - self.g['height'] == -2:
            self.scroll_down(3, y, x)
            screen.addstr(y-1, 0, line, curses.color_pair(color_pair_code))
            self.buf[y-1] = line, color_pair_code
        else:
            screen.addstr(y+1, 0, line, curses.color_pair(color_pair_code))
            self.buf[y+1] = line, color_pair_code

    def redraw(self, start_y, end_y, fallback_y, fallback_x):
        """
        Redraw rows start_y..end_y-1 from buf, then move the cursor to
        (fallback_y, fallback_x)
        """
        global screen
        for cursor in range(start_y, end_y):
            try:
                screen.move(cursor, 0)
                screen.clrtoeol()
                line, color_pair_code = self.buf[cursor]
                screen.addstr(cursor, 0, line, curses.color_pair(color_pair_code))
            except (KeyError, curses.error):
                # Row has no recorded content, or lies off-screen: leave blank
                pass
        screen.move(fallback_y, fallback_x)

    def scroll_down(self, noredraw, fallback_y, fallback_x):
        """
        Scroll the recorded lines up by noredraw-1 rows and repaint.
        """
        global screen
        # noredraw = n means that screen will scroll down n-1 line
        shift = noredraw - 1
        old = self.buf
        # The bottom `shift` rows have nothing below them to take over from
        dropped = set(heapq.nlargest(shift, old))
        # Build from a snapshot so the result does not depend on dict order;
        # rows whose source row was never recorded simply become blank.
        self.buf = {
            row: old[row + shift]
            for row in old
            if row not in dropped and (row + shift) in old
        }
        # Clear and redraw
        screen.clear()
        self.redraw(1, self.g['height'] - noredraw, fallback_y, fallback_x)

    def clear_upside(self, n, y, x):
        """
        Clear the n rows above row y, then put the cursor back at (y, x)
        """
        global screen
        for i in range(1, n+1):
            screen.move(y-i, 0)
            screen.clrtoeol()
        screen.refresh()
        screen.move(y, x)

    def display_suggest(self, y, x, word):
        """
        Display box of suggestion for ``word`` next to input row y.

        The box goes below the input line, or above it ("upside") when the
        input line is within 6 rows of the bottom of the screen.
        """
        global screen
        g = self.g
        side = 2
        width = g['width']

        # Check if need to print upside
        upside = y+6 > int(g['height'])

        # Redraw if suggestion is not the same as previous display
        sug = self.suggest(word)
        # prev is None until the first call; treat that as "nothing shown"
        # so the len() checks below cannot raise TypeError.
        prev = g['prev'] or []
        if sug != g['prev']:
            # 0-line means there is no suggestions (height = 0)
            # 3-line means there are many suggestions (height = 3)
            # 5-line means there is only one suggestion (height = 5)
            if upside:
                # Clearing above the prompt depends on both the old and the
                # new box height.
                if len(sug) > 1 and not prev:
                    # now: 3-lines / previous: 0 line
                    self.clear_upside(3, y, x)
                elif not sug and len(prev) > 1:
                    # now: 0-lines / previous: 3 lines
                    self.redraw(y-3, y, y, x)
                elif len(sug) > 1 and len(prev) == 1:
                    # now: 3-lines / previous: 5 lines
                    self.redraw(y-5, y-3, y, x)
                    self.clear_upside(3, y, x)
                elif len(sug) == 1 and len(prev) > 1:
                    # now: 5-lines / previous: 3 lines
                    self.clear_upside(3, y, x)
                elif not sug and len(prev) == 1:
                    # now: 0-lines / previous: 5 lines
                    self.redraw(y-5, y, y, x)
                elif len(sug) == len(prev) and len(prev) > 1:
                    # now: 3-lines / previous: 3 lines
                    self.clear_upside(3, y, x)
                elif len(sug) == len(prev) == 1:
                    # now: 5-lines / previous: 5 lines
                    self.clear_upside(5, y, x)
                screen.refresh()
            else:
                # Clear downside
                screen.clrtobot()
                screen.refresh()
        g['prev'] = sug

        if not sug:
            return

        if len(sug) > 1:
            # More than 1 suggestion: one boxed row of up to 5 candidates
            sug = sug[:5]
            # Box spans nearly the full width, but never more than the
            # screen (curses.newwin fails for a window that does not fit).
            needed_lenth = min(width, max(width - 5, sum(len(i) + side for i in sug) + side))
            top = y - 3 if upside else y + 1
            win = curses.newwin(3, needed_lenth, top, 0)
            win.erase()
            win.box()
            win.refresh()
            cur_width = side
            for item in sug:
                # Stop before writing past the right edge
                if cur_width + len(item) > width:
                    break
                screen.addstr(top + 1, cur_width, item, curses.color_pair(4))
                cur_width += len(item) + side
        else:
            # Only 1 suggestion: boxed candidate plus its description
            can = sug[0]
            text = self.description[can]
            top = y - 5 if upside else y + 1
            win = curses.newwin(5, min(width, len(text) + 2*side), top, 0)
            win.box()
            win.refresh()
            screen.addstr(top + 1, side, can, curses.color_pair(4))
            screen.addstr(top + 3, side, text, curses.color_pair(3))

    def _replace_input(self, y, x, word):
        """Replace the text after the prompt on row y with ``word`` and
        refresh the suggestion box."""
        global screen
        start = len(self.g['prefix'])
        screen.move(y, start)
        screen.clrtoeol()
        screen.addstr(y, start, word)
        self.display_suggest(y, x, word)
        screen.move(y, len(word) + start)

    def inputloop(self):
        """
        Main loop input: read keys until the user enters 'q'.

        The terminal is restored on exit, including when an exception
        escapes the loop.
        """
        global screen
        g = self.g
        word = ''
        screen.addstr("\n" + g['prefix'], curses.color_pair(7))
        try:
            while True:
                # Current position
                y, x = screen.getyx()
                # Get char
                event = screen.getch()
                try:
                    char = chr(event)
                except (ValueError, OverflowError):
                    # getch() returned -1 or a code outside chr()'s range
                    char = ''

                # Test curses_print_line
                if char == '?':
                    self.buf[y] = g['prefix'] + '?', 0
                    self.ascii_art('dtvd88')

                # TAB to complete
                elif event in self.tab_ary:
                    # First tab starts a cycle through the suggestions
                    if not g['tab_cycle']:
                        g['tab_cycle'] = itertools.cycle(self.suggest(word))
                    try:
                        word = next(g['tab_cycle'])
                    except StopIteration:
                        # Nothing to complete
                        g['tab_cycle'] = None
                        continue
                    try:
                        self._replace_input(y, x, word)
                    except curses.error:
                        pass

                # UP key
                elif event in self.up_ary:
                    if g['hist']:
                        # Step back, stopping at the oldest entry
                        if g['hist_index'] > -len(g['hist']):
                            g['hist_index'] -= 1
                        word = g['hist'][g['hist_index']]
                        self._replace_input(y, x, word)

                # DOWN key
                elif event in self.down_ary:
                    if g['hist']:
                        # Step forward, stopping at the newest entry
                        if not g['hist_index']:
                            g['hist_index'] = -1
                        if g['hist_index'] < -1:
                            g['hist_index'] += 1
                        word = g['hist'][g['hist_index']]
                        self._replace_input(y, x, word)

                # Enter key #### I should get the command out of there?
                # #### Can I register a callback function?
                elif event in self.enter_ary:
                    g['tab_cycle'] = None
                    g['hist_index'] = 0
                    g['hist'].append(word)
                    if word == 'q':
                        self.cleanup_command()
                        break
                    self.display_suggest(y, x, '')
                    screen.clrtobot()
                    self.handle_command(word)
                    self.buf[y] = g['prefix'] + word, 0
                    # Touch the screen's end
                    if y - g['height'] > -3:
                        self.scroll_down(2, y, x)
                        screen.addstr(y, 0, g['prefix'], curses.color_pair(7))  ## SHOW NEW PROMPT
                    else:
                        screen.addstr(y+1, 0, g['prefix'], curses.color_pair(7))
                    word = ''

                # Delete / Backspace
                elif event in self.delete_ary:
                    g['tab_cycle'] = None
                    # Touch to line start
                    if x < len(g['prefix']) + 1:
                        screen.move(y, x)
                        word = ''
                    # Middle of line
                    else:
                        word = word[:-1]
                        screen.move(y, x-1)
                        screen.clrtoeol()
                        self.display_suggest(y, x, word)
                        screen.move(y, x-1)

                # Another keys
                else:
                    g['tab_cycle'] = None
                    # Skip getch errors and unhandled function keys
                    # (codes >= KEY_MIN are not characters).
                    if not char or event >= curses.KEY_MIN:
                        continue
                    # Explicitly print char
                    try:
                        screen.addstr(char)
                        word += char
                        self.display_suggest(y, x, word)
                        screen.move(y, x+1)
                    except (ValueError, curses.error):  # got errors here when i adjusted the volume....
                        pass
        finally:
            # Reset
            self.close_window()

    def setup_command(self, outfile):
        """Open the command log and reset prompt / completion / history state."""
        self.data = open(outfile, 'a')
        self.g['prev'] = None
        self.g['tab_cycle'] = None
        self.g['prefix'] = '[gav]: '
        self.g['hist_index'] = 0
        # Load history from previous session
        try:
            with open('completer.hist') as o:
                self.g['hist'] = [i.strip() for i in o.readlines()]
        except (OSError, UnicodeError):
            # No (readable) history yet
            self.g['hist'] = []

    def cleanup_command(self):
        """Persist the history and close the command log."""
        # hist already contains everything loaded at startup, so rewrite the
        # file; appending duplicated the old entries on every exit and left
        # the last line unterminated.
        with open('completer.hist', 'w') as o:
            o.write("".join(entry + "\n" for entry in self.g['hist']))
        self.data.close()

    def handle_command(self, cmd):
        """Log one entered command.

        ``n <mode>`` starts a new section: the prompt becomes ``[<mode>]``
        and a ``# <mode>`` header is written.  Anything else is written to
        the log verbatim.
        """
        r1 = re.search(r'^n\s(.*)$', cmd)
        if r1:
            # new data collection mode
            mode = r1.group(1)
            self.g['prefix'] = "[" + mode + "]"
            self.data.write("\n\n# %s\n" % mode)
        else:
            #winsound.Beep(440,300)
            self.data.write(cmd + "\n")
        self.data.flush()
def repl_staff():
    """Run the interactive REPL with teacher names as the completion table.

    Reads cache/teacherdata/teachers.json (a list of objects with ``name``
    and ``login_id``) and logs entered commands to cache/people_logs.txt.
    """
    # Context manager so the JSON file is closed after parsing.
    with open('cache/teacherdata/teachers.json', 'r') as fh:
        tch = json.load(fh)
    newdict = {T['name']: 'teacher with id ' + T['login_id'] for T in tch}
    c = MyRepl()
    c.set_my_dict(newdict)
    c.startup('cache/people_logs.txt')
    c.inputloop()
def repl_degs():
    """Build a completion table from cache/attainment_masterlist.csv.

    Column 4 of each data row is the completion key and column 0 (or a
    single space when empty) its help text.  The first row is a header.
    Starting the REPL itself is currently disabled (see commented lines).
    """
    newdict = {}
    # newline='' is what the csv module expects; the with-block closes the
    # file, which was previously left open.
    with open('cache/attainment_masterlist.csv', 'r', newline='') as fh:
        rows = csv.reader(fh, delimiter=",")
        next(rows, None)  # skip header row
        for row in rows:
            # Blank or short rows used to raise IndexError
            if len(row) < 5:
                continue
            newdict[row[4]] = row[0] or ' '

    #print(newdict)
    #input('ready')
    c = MyRepl()
    c.set_my_dict(newdict)
    #c.startup('cache/g_path_cluster2020_.txt')
    # c.inputloop()
def repl():
    """Default REPL entry point; currently delegates to repl_degs()."""
    repl_degs()
#input('ready')
c = MyRepl()
c.set_my_dict(newdict)
#c.startup('cache/g_path_cluster2020_.txt')
# c.inputloop()
def repl():
repl_degs()
### courses.py

View File

@ -31,8 +31,6 @@ else:
q = Queue()
HOST_NAME = '127.0.0.1' #
HOST_NAME = '192.168.1.6' #
HOST_NAME = '192.168.1.6' #
PORT_NUMBER = 8080 # Maybe set this to 9000.
@ -342,600 +340,3 @@ if __name__ == '__main__':
"""class HelloWorldExample(object):
def make_teacher_rel(self, tchr, clss):
with self._driver.session() as tx:
tx.run("MERGE (tchr:Teacher {name: $tchr}) MERGE (tchr)-[:TEACHES]->(clss:Class {name: $clss})", \
tchr=tchr, clss=clss)
def __init__(self, uri, user, password):
self._driver = GraphDatabase.driver(uri, auth=(user, password))
def close(self):
self._driver.close()
def print_greeting(self, message):
with self._driver.session() as session:
greeting = session.write_transaction(self._create_and_return_greeting, message)
print(greeting)
@staticmethod
def _create_and_return_greeting(tx, message):
result = tx.run("CREATE (a:Greeting) "
"SET a.message = $message "
"RETURN a.message + ', from node ' + id(a)", message=message)
return result.single()[0]
"""
def make_teacher_rel(g, tchr, clss):
g.run("MERGE (tchr:Teacher {name: $tchr}) MERGE (tchr)-[:TEACHES]->(clss:Class {name: $clss})", \
tchr=tchr, clss=clss)
def testgraph():
gg = Graph("bolt://localhost:7687", auth=("neo4j", "asdf"))
#gg.run("DROP CONSTRAINT ON (tchr:Teacher) ASSERT tchr.name IS UNIQUE")
#gg.run("DROP CONSTRAINT ON (clss:Class) ASSERT clss.name IS UNIQUE")
#gg.run("CREATE INDEX ON :Teacher(name)")
#gg.run("CREATE INDEX ON :Class(name)")
stuff = json.loads( open('output/semesters/2020spring/sp20_sched.json','r').read())
# make lists of unique course code+name, teacher, locations
tch = {}
crs = {}
loc = {}
sem = Node("Semester", name="sp20")
for c in stuff:
if not c['teacher'] in tch:
tch[c['teacher']] = Node("Teacher", name=c['teacher'])
gg.create(tch[c['teacher']])
if not c['code'] in crs:
crs[ c['code'] ] = Node("Course section", name=c['name'], code=c['code'])
gg.create(crs[ c['code'] ])
if not c['loc'] in loc:
loc[ c['loc'] ] = Node("Location", loc=c['loc'])
gg.create(loc[ c['loc'] ])
sect = Node("Section", crn=int(c['crn']))
gg.create(Relationship(tch[c['teacher']], "TEACHES", sect ))
gg.create(Relationship(sect, "CLASS OF", crs[ c['code'] ] ))
gg.create(Relationship( sect, "LOCATED AT", loc[ c['loc'] ] ))
"""
for c in stuff:
print(c['crn'])
q = "CREATE (section:Section { Name: "+c['name']+", Code: "+c['code']+", Crn: "+c['crn']+", Teacher: "+c['teacher']+" })"
q = 'CREATE (section:Section { Name: "%s", Code: "%s", Crn: "%s", Teacher: "%s" })' % \
(c['name'], c['code'], c['crn'], c['teacher'])
gg.run(q)
"""
#gg = HelloWorldExample("bolt://localhost:7687", "neo4j", "asdf")
#gg.print_greeting("hi there world")
"""
make_teacher_rel(gg, "Peter Howell","CSIS 42")
make_teacher_rel(gg, "Alex Stoykov","CSIS 42")
make_teacher_rel(gg, "Sabrina Lawrence","CSIS 85")
make_teacher_rel(gg, "Peter Howell","CSIS 85")
"""
screen = 0
def Memoize( func):
"""
Memoize decorator
"""
cache = {}
@wraps(func)
def wrapper(*args):
if args not in cache:
cache[args] = func(*args)
return cache[args]
return wrapper
class MyRepl:
description = {
"switch ": "Switch stream. You can use either 'switch public' or 'switch mine'",
"home " : "Show your timeline. 'home 7' will show 7 tweet.",
"harry " : "a guys name.",
"homo " : "means the same.",
"view " : "'view @mdo' will show @mdo's home.",
"h " : "Show help.",
"t " : "'t opps' will tweet 'opps' immediately.",
"s " : "'s #AKB48' will search for '#AKB48' and return 5 newest tweets."
}
def startup(self, outfile):
global screen # make it self
self.g = {}
self.buf = {}
screen = None
self.enter_ary = [curses.KEY_ENTER,10]
self.delete_ary = [curses.KEY_BACKSPACE,curses.KEY_DC,8,127,263]
self.tab_ary = [9]
self.up_ary = [curses.KEY_UP]
self.down_ary = [curses.KEY_DOWN]
# Init curses screen
screen = curses.initscr()
screen.keypad(1)
curses.noecho()
try:
curses.start_color()
curses.use_default_colors()
for i in range(0, curses.COLORS):
curses.init_pair(i + 1, i, -1)
except curses.error:
pass
curses.cbreak()
self.g['height'] , self.g['width'] = screen.getmaxyx()
#print("Width: %i" % self.g['width'])
# Init color function
s = self
self.white = lambda x:curses_print_word(x,7) #0)
self.grey = lambda x:curses_print_word(x, 3) #3)1)
self.red = lambda x:curses_print_word(x,7) #2)
self.green = lambda x:curses_print_word(x, 3) #3)
self.yellow = lambda x:curses_print_word(x,7) #4)
self.blue = lambda x:curses_print_word(x,3)
self.magenta = lambda x:curses_print_word(x,7) #6)
self.cyan = lambda x:curses_print_word(x,7) #7)
self.colors_shuffle = [s.grey, s.red, s.green, s.yellow, s.blue, s.magenta, s.cyan]
self.cyc = itertools.cycle(s.colors_shuffle[1:])
self.index_cyc = itertools.cycle(range(1,8))
self.setup_command(outfile)
def set_my_dict(self,d):
self.description = d
@Memoize
def cycle_color(self, s):
"""
Cycle the colors_shuffle
"""
return next(self.cyc)
def ascii_art(self, text):
"""
Draw the Ascii Art
"""
fi = figlet_format(text, font='doom')
for i in fi.split('\n'):
self.curses_print_line(i,next(self.index_cyc))
def close_window(self, ):
"""
Close screen
"""
global screen
screen.keypad(0);
curses.nocbreak();
curses.echo()
curses.endwin()
def suggest(self, word):
"""
Find suggestion
"""
rel = []
if not word: return rel
word = word.lower()
for candidate in self.description:
ca = candidate.lower()
#if ca.startswith(word): rel.append(candidate)
for eachword in ca.split(" "):
if eachword.startswith(word):
rel.append(candidate)
return rel
def curses_print_word(self, word,color_pair_code):
"""
Print a word
"""
global screen
word = word.encode('utf8')
screen.addstr(word,curses.color_pair(color_pair_code))
def curses_print_line(self, line,color_pair_code):
"""
Print a line, scroll down if need
"""
global screen
line = line.encode('utf8')
y,x = screen.getyx()
if y - self.g['height'] == -3:
self.scroll_down(2,y,x)
screen.addstr(y,0,line,curses.color_pair(color_pair_code))
self.buf[y] = line, color_pair_code
elif y - self.g['height'] == -2:
self.scroll_down(3,y,x)
screen.addstr(y-1,0,line,curses.color_pair(color_pair_code))
self.buf[y-1] = line ,color_pair_code
else:
screen.addstr(y+1,0,line,curses.color_pair(color_pair_code))
self.buf[y+1] = line, color_pair_code
def redraw(self, start_y,end_y,fallback_y,fallback_x):
"""
Redraw lines from buf
"""
global screen
for cursor in range(start_y,end_y):
screen.move(cursor,0)
screen.clrtoeol()
try:
line, color_pair_code = self.buf[cursor]
screen.addstr(cursor,0,line,curses.color_pair(color_pair_code))
except:
pass
screen.move(fallback_y,fallback_x)
def scroll_down(self, noredraw,fallback_y,fallback_x):
"""
Scroll down 1 line
"""
global screen
# Recreate buf
# noredraw = n means that screen will scroll down n-1 line
trip_list = heapq.nlargest(noredraw-1,buf)
for i in buf:
if i not in trip_list:
self.buf[i] = self.buf[i+noredraw-1]
for j in trip_list:
buf.pop(j)
# Clear and redraw
screen.clear()
self.redraw(1,g['height']-noredraw,fallback_y,fallback_x)
def clear_upside(self, n,y,x):
"""
Clear n lines upside
"""
global screen
for i in range(1,n+1):
screen.move(y-i,0)
screen.clrtoeol()
screen.refresh()
screen.move(y,x)
def display_suggest(self, y,x,word):
"""
Display box of suggestion
"""
global screen
g = self.g
side = 2
# Check if need to print upside
upside = y+6 > int(g['height'])
# Redraw if suggestion is not the same as previous display
sug = self.suggest(word)
if sug != self.g['prev']:
# 0-line means there is no suggetions (height = 0)
# 3-line means there are many suggetions (height = 3)
# 5-line means there is only one suggetions (height = 5)
# Clear upside section
if upside:
# Clear upside is a bit difficult. Here it's seperate to 4 case.
# now: 3-lines / previous : 0 line
if len(sug) > 1 and not self.g['prev']:
self.clear_upside(3,y,x)
# now: 0-lines / previous :3 lines
elif not sug and len(g['prev'])>1:
self.redraw(y-3,y,y,x)
# now: 3-lines / previous :5 lines
elif len(sug) > 1 == len(g['prev']):
self.redraw(y-5,y-3,y,x)
self.clear_upside(3,y,x)
# now: 5-lines / previous :3 lines
elif len(sug) == 1 < len(g['prev']):
self.clear_upside(3,y,x)
# now: 0-lines / previous :5 lines
elif not sug and len(g['prev'])==1:
self.redraw(y-5,y,y,x)
# now: 3-lines / previous :3 lines
elif len(sug) == len(g['prev']) > 1:
self.clear_upside(3,y,x)
# now: 5-lines / previous :5 lines
elif len(sug) == len(g['prev']) == 1:
self.clear_upside(5,y,x)
screen.refresh()
else:
# Clear downside
screen.clrtobot()
screen.refresh()
self.g['prev'] = sug
if sug:
# More than 1 suggestion
if len(sug) > 1:
if len(sug) > 5: sug = sug[:5]
#needed_lenth = sum([len(i)+side for i in sug]) + side
needed_lenth = max( self.g['width']-5, sum([len(i)+side for i in sug]) + side)
print(self.g['width'])
print(word)
print(sug)
print(needed_lenth)
if upside:
win = curses.newwin(3,needed_lenth,y-3,0)
win.erase()
win.box()
win.refresh()
cur_width = side
for i in range(len(sug)):
if cur_width+len(sug[i]) > self.g['width']: break
screen.addstr(y-2,cur_width,sug[i],curses.color_pair(4))
cur_width += len(sug[i]) + side
if cur_width > self.g['width']:
break
else:
win = curses.newwin(3,needed_lenth,y+1,0)
win.erase()
win.box()
win.refresh()
cur_width = side
for i in range(len(sug)):
screen.addstr(y+2,cur_width,sug[i],curses.color_pair(4))
cur_width += len(sug[i]) + side
if cur_width > self.g['width']:
break
# Only 1 suggestion
else:
can = sug[0]
if upside:
win = curses.newwin(5,len(self.description[can])+2*side,y-5,0)
win.box()
win.refresh()
screen.addstr(y-4,side,can,curses.color_pair(4))
screen.addstr(y-2,side,self.description[can],curses.color_pair(3))
else:
win = curses.newwin(5,len(self.description[can])+2*side,y+1,0)
win.box()
win.refresh()
screen.addstr(y+2,side,can,curses.color_pair(4))
screen.addstr(y+4,side,self.description[can],curses.color_pair(3))
def inputloop(self):
    """
    Main input loop: read keys until the user submits ``q``.

    Reads one key code at a time from the module-level curses ``screen``
    and dispatches on it:

    * ``?``     -- stores the prompt line in ``self.buf`` and calls
                   ``self.ascii_art``.
    * TAB       -- cycles through the candidates from ``self.suggest(word)``.
    * UP / DOWN -- walks ``self.g['hist']`` via ``self.g['hist_index']``.
    * ENTER     -- appends the word to the history and passes it to
                   ``self.handle_command``; the word ``q`` calls
                   ``self.cleanup_command`` and ends the loop instead.
    * DELETE    -- removes the last character of the current word.
    * any other key is echoed and appended to the current word.

    ``self.close_window`` is called once the loop has ended.
    """
    word = ''
    screen.addstr("\n" + self.g['prefix'], curses.color_pair(7))
    while True:
        # Current position
        y, x = screen.getyx()
        # Get char
        event = screen.getch()
        try:
            char = chr(event)
        except (ValueError, OverflowError, TypeError):
            # Key codes outside the Unicode range (negative or too large)
            # have no character form.
            char = ''
        # Test curses_print_line
        if char == '?':
            self.buf[y] = self.g['prefix'] + '?', 0
            self.ascii_art('dtvd88')
        # TAB to complete
        elif event in self.tab_ary:
            # Completion is best-effort: an empty suggestion list makes
            # next() raise StopIteration and drawing can fail near the
            # screen edge.  Catch Exception rather than everything so that
            # KeyboardInterrupt / SystemExit still propagate.
            try:
                # First tab: start a fresh cycle over the suggestions
                if not self.g['tab_cycle']:
                    self.g['tab_cycle'] = itertools.cycle(self.suggest(word))
                suggestion = next(self.g['tab_cycle'])
                # Clear current line
                screen.move(y, len(self.g['prefix']))
                screen.clrtoeol()
                # Print out suggestion
                word = suggestion
                screen.addstr(y, len(self.g['prefix']), word)
                self.display_suggest(y, x, word)
                screen.move(y, len(word) + len(self.g['prefix']))
            except Exception:
                pass
        # UP key
        elif event in self.up_ary:
            if self.g['hist']:
                # Clear current line
                screen.move(y, len(self.g['prefix']))
                screen.clrtoeol()
                # Step back one entry unless already at the oldest one
                if self.g['hist_index'] > 0 - len(self.g['hist']):
                    self.g['hist_index'] -= 1
                word = self.g['hist'][self.g['hist_index']]
                screen.addstr(y, len(self.g['prefix']), word)
                self.display_suggest(y, x, word)
                screen.move(y, len(word) + len(self.g['prefix']))
        # DOWN key
        elif event in self.down_ary:
            if self.g['hist']:
                # Clear current line
                screen.move(y, len(self.g['prefix']))
                screen.clrtoeol()
                # Index 0 means "not browsing yet": start at the newest entry
                if not self.g['hist_index']:
                    self.g['hist_index'] = -1
                # Step forward one entry unless already at the newest one
                if self.g['hist_index'] < -1:
                    self.g['hist_index'] += 1
                word = self.g['hist'][self.g['hist_index']]
                screen.addstr(y, len(self.g['prefix']), word)
                self.display_suggest(y, x, word)
                screen.move(y, len(word) + len(self.g['prefix']))
        # Enter key
        # TODO: move command dispatch out of this loop (callback hook?)
        elif event in self.enter_ary:
            self.g['tab_cycle'] = None
            self.g['hist_index'] = 0
            self.g['hist'].append(word)
            if word == 'q':
                self.cleanup_command()
                break
            self.display_suggest(y, x, '')
            screen.clrtobot()
            self.handle_command(word)
            self.buf[y] = self.g['prefix'] + word, 0
            # Touch the screen's end
            if y - self.g['height'] > -3:
                self.scroll_down(2, y, x)
                # Show the new prompt on the same row after scrolling
                screen.addstr(y, 0, self.g['prefix'], curses.color_pair(7))
            else:
                screen.addstr(y + 1, 0, self.g['prefix'], curses.color_pair(7))
            word = ''
        # Delete / Backspace
        elif event in self.delete_ary:
            self.g['tab_cycle'] = None
            # Touch to line start
            if x < len(self.g['prefix']) + 1:
                screen.move(y, x)
                word = ''
            # Middle of line
            else:
                word = word[:-1]
                screen.move(y, x - 1)
                screen.clrtoeol()
                self.display_suggest(y, x, word)
                screen.move(y, x - 1)
        # Any other key
        else:
            self.g['tab_cycle'] = None
            if not char:
                # Nothing printable was decoded, so leave the word and
                # the cursor where they are.
                continue
            # Explicitly print char
            try:
                screen.addstr(char)
                word += char
                self.display_suggest(y, x, word)
                screen.move(y, x + 1)
            except ValueError:
                # Seen in practice when media keys (volume) were pressed.
                pass
    # Reset
    self.close_window()
def setup_command(self, outfile):
    """
    Prepare the command prompt state.

    Opens ``outfile`` in append mode as ``self.data`` (it stays open; see
    ``cleanup_command``), resets the prompt-related entries of ``self.g``
    and loads the history of earlier sessions from ``completer.hist`` in
    the current working directory.

    :param outfile: path of the file that entered commands are appended to.
    """
    self.data = open(outfile, 'a')
    self.g['prev'] = None
    self.g['tab_cycle'] = None
    self.g['prefix'] = '[gav]: '
    self.g['hist_index'] = 0
    # Load history from previous session; a missing or unreadable history
    # file simply means starting with an empty history.
    try:
        with open('completer.hist') as hist_file:
            self.g['hist'] = [line.strip() for line in hist_file]
    except (OSError, UnicodeDecodeError):
        self.g['hist'] = []
def cleanup_command(self):
    """
    Persist the command history and close the output data file.

    ``setup_command`` loads every line of ``completer.hist`` into
    ``self.g['hist']``, so the list already begins with the entries of
    earlier sessions.  The file is therefore rewritten rather than
    appended to, which would duplicate the old entries on every run.
    Each entry is written on its own newline-terminated line so the last
    entry cannot fuse with whatever is written next.

    ``self.data`` is closed even if writing the history fails.
    """
    try:
        with open('completer.hist', 'w') as hist_file:
            hist_file.writelines(entry + "\n" for entry in self.g['hist'])
    finally:
        self.data.close()
def handle_command(self, cmd):
    """
    Record one entered command in the output data file.

    A command of the form ``n <mode>`` switches to a new data collection
    mode: the prompt prefix becomes ``[<mode>]`` and a ``# <mode>`` header
    is written.  Any other command is written verbatim on its own line.
    The data file is flushed after every command.

    :param cmd: the text the user submitted.
    """
    mode_match = re.search(r'^n\s(.*)$', cmd)
    if mode_match is None:
        # Ordinary command: store it as-is.
        line = cmd + "\n"
    else:
        # New data collection mode: relabel the prompt and start a section.
        mode_name = mode_match.group(1)
        self.g['prefix'] = "[" + mode_name + "]"
        line = "\n\n# %s\n" % mode_name
    self.data.write(line)
    self.data.flush()
def repl_staff():
    """
    Run the interactive prompt over the cached teacher list.

    Reads ``cache/teacherdata/teachers.json``, maps each record's ``name``
    to the text ``'teacher with id <login_id>'``, hands that mapping to a
    new ``MyRepl`` via ``set_my_dict``, and runs its input loop with
    ``cache/people_logs.txt`` as the startup argument.
    """
    # Close the roster file as soon as it has been parsed.
    with open('cache/teacherdata/teachers.json', 'r') as roster_file:
        teachers = json.load(roster_file)
    descriptions = {
        teacher['name']: 'teacher with id ' + teacher['login_id']
        for teacher in teachers
    }
    c = MyRepl()
    c.set_my_dict(descriptions)
    c.startup('cache/people_logs.txt')
    c.inputloop()
def repl_degs():
    """
    Build a ``MyRepl`` loaded with entries from the attainment master list.

    Reads ``cache/attainment_masterlist.csv``, skips its first row
    (presumably a header -- confirm against the file) and maps column 4 of
    every remaining row to column 0, substituting a single space when
    column 0 is empty.  Rows with fewer than five columns (for example
    blank lines) are skipped instead of raising ``IndexError``.

    The prompt itself is not started here: the ``startup`` / ``inputloop``
    calls below are commented out.
    """
    entries = {}
    # newline='' lets the csv module handle line endings itself, as its
    # documentation recommends for files passed to csv.reader.
    with open('cache/attainment_masterlist.csv', 'r', newline='') as csv_file:
        rows = csv.reader(csv_file, delimiter=",")
        next(rows, None)  # skip the first row
        for row in rows:
            if len(row) < 5:
                continue
            entries[row[4]] = row[0] or ' '
    c = MyRepl()
    c.set_my_dict(entries)
    #c.startup('cache/g_path_cluster2020_.txt')
    # c.inputloop()
def repl():
    """Entry point that delegates to ``repl_degs``."""
    repl_degs()
# NOTE(review): the lines below repeat the tail of repl_degs() and use
# `newdict`, which is not defined at this level -- this looks like an
# extraction/merge artifact; confirm before relying on them.
#input('ready')
c = MyRepl()
c.set_my_dict(newdict)
#c.startup('cache/g_path_cluster2020_.txt')
# c.inputloop()
def repl():
    """Entry point that delegates to ``repl_degs``."""
    repl_degs()

View File

@ -20,6 +20,9 @@ import socket
# Host name of the machine running this code; printed when this code runs.
this_host = socket.gethostname()
print('\n\n server host: ' + this_host, '\n\n')
# Hard-coded data file, filesystem locations and server address.
# NOTE(review): these are machine-specific values -- confirm before running
# on another host.
datafile2 = "cache/datafile.txt"
LECPATH = "/media/hd2/peter_home_offload/lecture/"
host = 'http://192.168.1.6:5000'
news_path = '/media/hd2/peter_home/Documents/scripts/browser/'