From 0dde58d6844598d2fc08636009cf65f8917d1f9d Mon Sep 17 00:00:00 2001 From: zachary62 Date: Fri, 27 Dec 2024 05:29:24 +0000 Subject: [PATCH] add examples --- .gitignore | 3 +- cookbook/demo.ipynb | 267 ++++++++++++++++ data/PaulGrahamEssaysLarge/addiction.txt | 116 +++++++ data/PaulGrahamEssaysLarge/aord.txt | 126 ++++++++ data/PaulGrahamEssaysLarge/apple.txt | 201 ++++++++++++ data/PaulGrahamEssaysLarge/avg.txt | 375 ++++++++++++++++++++++ data/PaulGrahamEssaysLarge/before.txt | 387 +++++++++++++++++++++++ docs/prompt | 223 ++++++++++++- minillmflow/__init__.py | 30 +- setup.py | 2 +- tests/test_async_batch_flow.py | 10 +- tests/test_async_flow.py | 18 +- tests/test_batch_flow.py | 12 +- tests/test_batch_node.py | 10 +- tests/test_flow_basic.py | 10 +- tests/test_flow_composition.py | 22 +- 16 files changed, 1753 insertions(+), 59 deletions(-) create mode 100644 cookbook/demo.ipynb create mode 100644 data/PaulGrahamEssaysLarge/addiction.txt create mode 100644 data/PaulGrahamEssaysLarge/aord.txt create mode 100644 data/PaulGrahamEssaysLarge/apple.txt create mode 100644 data/PaulGrahamEssaysLarge/avg.txt create mode 100644 data/PaulGrahamEssaysLarge/before.txt diff --git a/.gitignore b/.gitignore index 13a1aac..884a33a 100644 --- a/.gitignore +++ b/.gitignore @@ -71,4 +71,5 @@ htmlcov/ *.temp -test.ipynb \ No newline at end of file +test.ipynb +.pytest_cache/ \ No newline at end of file diff --git a/cookbook/demo.ipynb b/cookbook/demo.ipynb new file mode 100644 index 0000000..a5a6f3c --- /dev/null +++ b/cookbook/demo.ipynb @@ -0,0 +1,267 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No relevant file found: the question has no relevant file because while some files discuss startups, none specifically address how to find or generate startup ideas\n", + "No question asked\n" + ] + }, + { + "data": { + "text/plain": [ + "'default'" + ] + }, + 
"execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example App for text summarization & QA using minillmflow\n", + "from minillmflow import Node, BatchNode, Flow, BatchFlow, AsyncNode, AsyncFlow, BatchAsyncFlow\n", + "import os\n", + "\n", + "# 1) Implement a simple LLM helper (OpenAI in this example).\n", + "def call_LLM(prompt):\n", + " # Users must set an OpenAI API key; can also load from env var, etc.\n", + " openai.api_key = \"YOUR_API_KEY_HERE\"\n", + " r = openai.ChatCompletion.create(\n", + " model=\"gpt-4\",\n", + " messages=[{\"role\": \"user\", \"content\": prompt}]\n", + " )\n", + " return r.choices[0].message.content\n", + "\n", + "# 2) Create a shared store (dict) for Node/Flow data exchange.\n", + "# This can be replaced with a DB or other storage.\n", + "# Design the structure / schema based on the app requirements.\n", + "shared = {\"data\": {}, \"summary\": {}}\n", + "\n", + "# 3) Create a Node that loads data from disk into shared['data'].\n", + "class LoadData(Node):\n", + " # For compute-intensive operations, do them in prep().\n", + " def prep(self, shared):\n", + " path = \"../data/PaulGrahamEssaysLarge\"\n", + " for filename in os.listdir(path):\n", + " with open(os.path.join(path, filename), 'r') as f:\n", + " shared['data'][filename] = f.read()\n", + " # If LLM was needed, we'd handle it in exec(). 
Not needed here.\n", + " # (idempotent so it can be retried if needed)\n", + " def exec(self,shared,prep_res): pass \n", + " # post() can update shared again or decide the next node (by returning the action).\n", + " def post(self,shared,prep_res,exec_res): pass \n", + "\n", + "load_data = LoadData()\n", + "# Run the data-loading node once\n", + "load_data.run(shared)\n", + "\n", + "# 4) Create a Node that summarizes a single file using the LLM.\n", + "class SummarizeFile(Node):\n", + " def prep(self, shared):\n", + " # Use self.params (which must remain immutable during prep/exec/post).\n", + " # Typically, we only store identifying info in params (e.g., filename).\n", + " content = shared['data'][self.params['filename']]\n", + " return content\n", + " def exec(self, shared, prep_res):\n", + " content = prep_res\n", + " prompt = f\"{content} Respond a summary of above in 10 words\"\n", + " summary = call_llm(prompt)\n", + " return summary\n", + " def post(self, shared, prep_res, exec_res):\n", + " shared[\"summary\"][self.params['filename']] = exec_res\n", + "\n", + "summarize_file = SummarizeFile()\n", + "# For testing, we set params directly on the node.\n", + "# In real usage, you'd set them in a Flow or BatchFlow.\n", + "summarize_file.set_params({\"filename\":\"addiction.txt\"})\n", + "summarize_file.run(shared)\n", + "\n", + "# 5) If data is large, we can apply a map-reduce pattern:\n", + "# - MapSummaries(BatchNode) => chunk the file and summarize each chunk\n", + "# - ReduceSummaries(Node) => combine those chunk-level summaries\n", + "class MapSummaries(BatchNode):\n", + " def prep(self, shared):\n", + " content = shared['data'][self.params['filename']]\n", + " chunk_size = 10000\n", + " chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]\n", + " # Must return an iterable (list or generator) for a BatchNode.\n", + " return chunks\n", + " def exec(self, shared, prep_res):\n", + " # Each iteration of prep_res corresponds to a single 
chunk.\n", + " chunk = prep_res\n", + " prompt = f\"{chunk} Respond a summary of above in 10 words\"\n", + " summary = call_llm(prompt)\n", + " return summary\n", + " def post(self, shared, prep_res, exec_res):\n", + " # exec_res is a list of exec() results (summaries for each chunk).\n", + " combined_summary = [f\"{i}. {summary}\" for i, summary in enumerate(exec_res)]\n", + " shared[\"summary\"][self.params['filename']] = combined_summary\n", + "\n", + "class ReduceSummaries(Node):\n", + " def prep(self, shared):\n", + " # Retrieve the list of chunk summaries from shared storage\n", + " return shared[\"summary\"][self.params['filename']]\n", + " def exec(self, shared, prep_res):\n", + " combined_summary = prep_res\n", + " prompt = f\"{combined_summary} Respond a summary of above in 10 words\"\n", + " summary = call_llm(prompt)\n", + " return summary\n", + " def post(self, shared, prep_res, exec_res):\n", + " # Store the combined summary as the final summary for this file.\n", + " shared[\"summary\"][self.params['filename']] = exec_res\n", + " \n", + "map_summaries = MapSummaries()\n", + "reduce_summaries = ReduceSummaries()\n", + "# Link map_summaries to reduce_summaries with an action\n", + "# By default, the action is \"default\" (when post returns None, it takes \"default\" action)\n", + "# This is the same as map_summaries - \"default\" >> reduce_summaries\n", + "map_summaries >> reduce_summaries\n", + "\n", + "# We don't directly call map_summaries.run(shared), \n", + "# because that alone would process only the map step without reduce.\n", + "\n", + "# 6) Instead, create a Flow that starts from map_summaries (a Node) \n", + "# and automatically includes reduce_summaries. 
\n", + "# Note: A Flow can also start from any other Flow or BatchFlow.\n", + "\n", + "\n", + "file_summary_flow = Flow(start=map_summaries)\n", + "# When a flow's params are set, they are recursively applied to all nodes in the flow\n", + "file_summary_flow.set_params({\"filename\":\"before.txt\"})\n", + "file_summary_flow.run(shared)\n", + "\n", + "# 7) Summarize all files using a BatchFlow that reruns file_summary_flow for each file\n", + "class SummarizeAllFiles(BatchFlow):\n", + " def prep(self, shared):\n", + " # Return a list of parameters to apply in each flow iteration.\n", + " # Each individual param will be merged with this node's own params \n", + " # Allowing nesting of multi-level BatchFlow. \n", + " # E.g., first level directory, second level file.\n", + " return [{\"filename\":filename} for filename in shared['data']]\n", + "\n", + "summarize_all_files = SummarizeAllFiles(start=file_summary_flow)\n", + "summarize_all_files.run(shared)\n", + "\n", + "\n", + "# 8) QA Agent: Find the most relevant file based on summary with actions\n", + "# if no question is asked:\n", + "# (a) end: terminate the flow \n", + "# if question is asked:\n", + "# if relevant file is found:\n", + "# (b) answer: move to answer node and read the whole file to answer the question\n", + "# if no relevant file is found:\n", + "# (c) retry: retry the process to find the relevant file\n", + "class FindRelevantFile(Node):\n", + " def prep(self, shared):\n", + " question = input(\"Enter a question: \")\n", + " formatted_list = [f\"- '{filename}': {shared['summary'][filename]}\" \n", + " for filename in shared['summary']]\n", + " return question, formatted_list\n", + " def exec(self, shared, prep_res):\n", + " question, formatted_list = prep_res\n", + " if not question:\n", + " return {\"think\":\"no question\", \"has_relevant\":False}\n", + " # Provide a structured YAML output that includes:\n", + " # - The chain of thought\n", + " # - Whether any relevant file was found\n", + " # 
- The most relevant file if found\n", + " prompt = f\"\"\"Question: {question} \n", + "Find the most relevant file from: \n", + "{formatted_list}\n", + "If no relevant file, explain why\n", + "Respond in yaml without additional information:\n", + "think: the question has/has no relevant file ...\n", + "has_relevant: true/false\n", + "most_relevant: filename\"\"\"\n", + " response = call_llm(prompt)\n", + " import yaml\n", + " result = yaml.safe_load(response)\n", + " # Ensure required fields are present\n", + " assert \"think\" in result\n", + " assert \"has_relevant\" in result\n", + " assert \"most_relevant\" in result if result[\"has_relevant\"] else True\n", + " return result\n", + " # handle errors by returning a default response in case of exception after retries\n", + " def process_after_fail(self,shared,prep_res,exc):\n", + " # if not overridden, the default is to throw the exception\n", + " return {\"think\":\"error finding the file\", \"has_relevant\":False}\n", + " def post(self, shared, prep_res, exec_res):\n", + " question, _ = prep_res\n", + " # Decide what to do next based on the results\n", + " if not question:\n", + " print(f\"No question asked\")\n", + " return \"end\"\n", + " if exec_res[\"has_relevant\"]:\n", + " # Store the question and most relevant file in shared\n", + " shared[\"question\"] = question\n", + " shared[\"relevant_file\"] = exec_res['most_relevant']\n", + " print(f\"Relevant file found: {exec_res['most_relevant']}\")\n", + " return \"answer\"\n", + " else:\n", + " print(f\"No relevant file found: {exec_res['think']}\")\n", + " return \"retry\"\n", + "\n", + "class AnswerQuestion(Node):\n", + " def prep(self, shared):\n", + " question = shared['question']\n", + " relevant_file = shared['relevant_file']\n", + " # Read the whole file content\n", + " file_content = shared['data'][relevant_file]\n", + " return question, file_content\n", + " def exec(self, shared, prep_res):\n", + " question, file_content = prep_res\n", + " prompt = 
f\"\"\"Question: {question}\n", + "File: {file_content}\n", + "Answer the question in 50 words\"\"\"\n", + " response = call_llm(prompt)\n", + " return response\n", + " def post(self, shared, prep_res, exec_res):\n", + " print(f\"Answer: {exec_res}\")\n", + "\n", + "class NoOp(Node):\n", + " pass\n", + "\n", + "# Configure the QA agent with appropriate transitions and retries\n", + "find_relevant_file = FindRelevantFile(max_retries=3)\n", + "answer_question = AnswerQuestion()\n", + "no_op = NoOp()\n", + "\n", + "# Connect the nodes based on the actions they return\n", + "find_relevant_file - \"answer\" >> answer_question >> find_relevant_file\n", + "find_relevant_file - \"retry\" >> find_relevant_file\n", + "find_relevant_file - \"end\" >> no_op\n", + "\n", + "qa_agent = Flow(start=find_relevant_file)\n", + "qa_agent.run(shared)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data/PaulGrahamEssaysLarge/addiction.txt b/data/PaulGrahamEssaysLarge/addiction.txt new file mode 100644 index 0000000..ff11c9f --- /dev/null +++ b/data/PaulGrahamEssaysLarge/addiction.txt @@ -0,0 +1,116 @@ +July 2010What hard liquor, cigarettes, heroin, and crack have in common is +that they're all more concentrated forms of less addictive predecessors. +Most if not all the things we describe as addictive are. And the +scary thing is, the process that created them is accelerating.We wouldn't want to stop it. It's the same process that cures +diseases: technological progress. Technological progress means +making things do more of what we want. 
When the thing we want is +something we want to want, we consider technological progress good. +If some new technique makes solar cells x% more efficient, that +seems strictly better. When progress concentrates something we +don't want to want—when it transforms opium into heroin—it seems +bad. But it's the same process at work. +[1]No one doubts this process is accelerating, which means increasing +numbers of things we like will be transformed into things we like +too much. +[2]As far as I know there's no word for something we like too much. +The closest is the colloquial sense of "addictive." That usage has +become increasingly common during my lifetime. And it's clear why: +there are an increasing number of things we need it for. At the +extreme end of the spectrum are crack and meth. Food has been +transformed by a combination of factory farming and innovations in +food processing into something with way more immediate bang for the +buck, and you can see the results in any town in America. Checkers +and solitaire have been replaced by World of Warcraft and FarmVille. +TV has become much more engaging, and even so it can't compete with Facebook.The world is more addictive than it was 40 years ago. And unless +the forms of technological progress that produced these things are +subject to different laws than technological progress in general, +the world will get more addictive in the next 40 years than it did +in the last 40.The next 40 years will bring us some wonderful things. I don't +mean to imply they're all to be avoided. Alcohol is a dangerous +drug, but I'd rather live in a world with wine than one without. +Most people can coexist with alcohol; but you have to be careful. +More things we like will mean more things we have to be careful +about.Most people won't, unfortunately. Which means that as the world +becomes more addictive, the two senses in which one can live a +normal life will be driven ever further apart. 
One sense of "normal" +is statistically normal: what everyone else does. The other is the +sense we mean when we talk about the normal operating range of a +piece of machinery: what works best.These two senses are already quite far apart. Already someone +trying to live well would seem eccentrically abstemious in most of +the US. That phenomenon is only going to become more pronounced. +You can probably take it as a rule of thumb from now on that if +people don't think you're weird, you're living badly.Societies eventually develop antibodies to addictive new things. +I've seen that happen with cigarettes. When cigarettes first +appeared, they spread the way an infectious disease spreads through +a previously isolated population. Smoking rapidly became a +(statistically) normal thing. There were ashtrays everywhere. We +had ashtrays in our house when I was a kid, even though neither of +my parents smoked. You had to for guests.As knowledge spread about the dangers of smoking, customs changed. +In the last 20 years, smoking has been transformed from something +that seemed totally normal into a rather seedy habit: from something +movie stars did in publicity shots to something small huddles of +addicts do outside the doors of office buildings. A lot of the +change was due to legislation, of course, but the legislation +couldn't have happened if customs hadn't already changed.It took a while though—on the order of 100 years. And unless the +rate at which social antibodies evolve can increase to match the +accelerating rate at which technological progress throws off new +addictions, we'll be increasingly unable to rely on customs to +protect us. +[3] +Unless we want to be canaries in the coal mine +of each new addiction—the people whose sad example becomes a +lesson to future generations—we'll have to figure out for ourselves +what to avoid and how. 
It will actually become a reasonable strategy +(or a more reasonable strategy) to suspect +everything new.In fact, even that won't be enough. We'll have to worry not just +about new things, but also about existing things becoming more +addictive. That's what bit me. I've avoided most addictions, but +the Internet got me because it became addictive while I was using +it. +[4]Most people I know have problems with Internet addiction. We're +all trying to figure out our own customs for getting free of it. +That's why I don't have an iPhone, for example; the last thing I +want is for the Internet to follow me out into the world. +[5] +My latest trick is taking long hikes. I used to think running was a +better form of exercise than hiking because it took less time. Now +the slowness of hiking seems an advantage, because the longer I +spend on the trail, the longer I have to think without interruption.Sounds pretty eccentric, doesn't it? It always will when you're +trying to solve problems where there are no customs yet to guide +you. Maybe I can't plead Occam's razor; maybe I'm simply eccentric. +But if I'm right about the acceleration of addictiveness, then this +kind of lonely squirming to avoid it will increasingly be the fate +of anyone who wants to get things done. We'll increasingly be +defined by what we say no to. +Notes[1] +Could you restrict technological progress to areas where you +wanted it? Only in a limited way, without becoming a police state. +And even then your restrictions would have undesirable side effects. +"Good" and "bad" technological progress aren't sharply differentiated, +so you'd find you couldn't slow the latter without also slowing the +former. And in any case, as Prohibition and the "war on drugs" +show, bans often do more harm than good.[2] +Technology has always been accelerating. By Paleolithic +standards, technology evolved at a blistering pace in the Neolithic +period.[3] +Unless we mass produce social customs. 
I suspect the recent +resurgence of evangelical Christianity in the US is partly a reaction +to drugs. In desperation people reach for the sledgehammer; if +their kids won't listen to them, maybe they'll listen to God. But +that solution has broader consequences than just getting kids to +say no to drugs. You end up saying no to +science as well. +I worry we may be heading for a future in which only a few people +plot their own itinerary through no-land, while everyone else books +a package tour. Or worse still, has one booked for them by the +government.[4] +People commonly use the word "procrastination" to describe +what they do on the Internet. It seems to me too mild to describe +what's happening as merely not-doing-work. We don't call it +procrastination when someone gets drunk instead of working.[5] +Several people have told me they like the iPad because it +lets them bring the Internet into situations where a laptop would +be too conspicuous. In other words, it's a hip flask. (This is +true of the iPhone too, of course, but this advantage isn't as +obvious because it reads as a phone, and everyone's used to those.)Thanks to Sam Altman, Patrick Collison, Jessica Livingston, and +Robert Morris for reading drafts of this. \ No newline at end of file diff --git a/data/PaulGrahamEssaysLarge/aord.txt b/data/PaulGrahamEssaysLarge/aord.txt new file mode 100644 index 0000000..fb6547a --- /dev/null +++ b/data/PaulGrahamEssaysLarge/aord.txt @@ -0,0 +1,126 @@ +October 2015When I talk to a startup that's been operating for more than 8 or +9 months, the first thing I want to know is almost always the same. +Assuming their expenses remain constant and their revenue growth +is what it has been over the last several months, do they make it to +profitability on the money they have left? Or to put it more +dramatically, by default do they live or die?The startling thing is how often the founders themselves don't know. 
+Half the founders I talk to don't know whether they're default alive +or default dead.If you're among that number, Trevor Blackwell has made a handy +calculator you can use to find out.The reason I want to know first whether a startup is default alive +or default dead is that the rest of the conversation depends on the +answer. If the company is default alive, we can talk about ambitious +new things they could do. If it's default dead, we probably need +to talk about how to save it. We know the current trajectory ends +badly. How can they get off that trajectory?Why do so few founders know whether they're default alive or default +dead? Mainly, I think, because they're not used to asking that. +It's not a question that makes sense to ask early on, any more than +it makes sense to ask a 3 year old how he plans to support +himself. But as the company grows older, the question switches from +meaningless to critical. That kind of switch often takes people +by surprise.I propose the following solution: instead of starting to ask too +late whether you're default alive or default dead, start asking too +early. It's hard to say precisely when the question switches +polarity. But it's probably not that dangerous to start worrying +too early that you're default dead, whereas it's very dangerous to +start worrying too late.The reason is a phenomenon I wrote about earlier: the +fatal pinch. +The fatal pinch is default dead + slow growth + not enough +time to fix it. And the way founders end up in it is by not realizing +that's where they're headed.There is another reason founders don't ask themselves whether they're +default alive or default dead: they assume it will be easy to raise +more money. But that assumption is often false, and worse still, the +more you depend on it, the falser it becomes.Maybe it will help to separate facts from hopes. Instead of thinking +of the future with vague optimism, explicitly separate the components. 
+Say "We're default dead, but we're counting on investors to save +us." Maybe as you say that, it will set off the same alarms in your +head that it does in mine. And if you set off the alarms sufficiently +early, you may be able to avoid the fatal pinch.It would be safe to be default dead if you could count on investors +saving you. As a rule their interest is a function of +growth. If you have steep revenue growth, say over 5x a year, you +can start to count on investors being interested even if you're not +profitable. +[1] +But investors are so fickle that you can never +do more than start to count on them. Sometimes something about your +business will spook investors even if your growth is great. So no +matter how good your growth is, you can never safely treat fundraising +as more than a plan A. You should always have a plan B as well: you +should know (as in write down) precisely what you'll need to do to +survive if you can't raise more money, and precisely when you'll +have to switch to plan B if plan A isn't working.In any case, growing fast versus operating cheaply is far from the +sharp dichotomy many founders assume it to be. In practice there +is surprisingly little connection between how much a startup spends +and how fast it grows. When a startup grows fast, it's usually +because the product hits a nerve, in the sense of hitting some big +need straight on. When a startup spends a lot, it's usually because +the product is expensive to develop or sell, or simply because +they're wasteful.If you're paying attention, you'll be asking at this point not just +how to avoid the fatal pinch, but how to avoid being default dead. +That one is easy: don't hire too fast. Hiring too fast is by far +the biggest killer of startups that raise money. +[2]Founders tell themselves they need to hire in order to grow. But +most err on the side of overestimating this need rather than +underestimating it. Why? Partly because there's so much work to +do. 
Naive founders think that if they can just hire enough +people, it will all get done. Partly because successful startups have +lots of employees, so it seems like that's what one does in order +to be successful. In fact the large staffs of successful startups +are probably more the effect of growth than the cause. And +partly because when founders have slow growth they don't want to +face what is usually the real reason: the product is not appealing +enough.Plus founders who've just raised money are often encouraged to +overhire by the VCs who funded them. Kill-or-cure strategies are +optimal for VCs because they're protected by the portfolio effect. +VCs want to blow you up, in one sense of the phrase or the other. +But as a founder your incentives are different. You want above all +to survive. +[3]Here's a common way startups die. They make something moderately +appealing and have decent initial growth. They raise their first +round fairly easily, because the founders seem smart and the idea +sounds plausible. But because the product is only moderately +appealing, growth is ok but not great. The founders convince +themselves that hiring a bunch of people is the way to boost growth. +Their investors agree. But (because the product is only moderately +appealing) the growth never comes. Now they're rapidly running out +of runway. They hope further investment will save them. But because +they have high expenses and slow growth, they're now unappealing +to investors. They're unable to raise more, and the company dies.What the company should have done is address the fundamental problem: +that the product is only moderately appealing. Hiring people is +rarely the way to fix that. More often than not it makes it harder. +At this early stage, the product needs to evolve more than to be +"built out," and that's usually easier with fewer people. +[4]Asking whether you're default alive or default dead may save you +from this. 
Maybe the alarm bells it sets off will counteract the +forces that push you to overhire. Instead you'll be compelled to +seek growth in other ways. For example, by doing +things that don't scale, or by redesigning the product in the +way only founders can. +And for many if not most startups, these paths to growth will be +the ones that actually work.Airbnb waited 4 months after raising money at the end of Y Combinator +before they hired their first employee. In the meantime the founders +were terribly overworked. But they were overworked evolving Airbnb +into the astonishingly successful organism it is now.Notes[1] +Steep usage growth will also interest investors. Revenue +will ultimately be a constant multiple of usage, so x% usage growth +predicts x% revenue growth. But in practice investors discount +merely predicted revenue, so if you're measuring usage you need a +higher growth rate to impress investors.[2] +Startups that don't raise money are saved from hiring too +fast because they can't afford to. But that doesn't mean you should +avoid raising money in order to avoid this problem, any more than +that total abstinence is the only way to avoid becoming an alcoholic.[3] +I would not be surprised if VCs' tendency to push founders +to overhire is not even in their own interest. They don't know how +many of the companies that get killed by overspending might have +done well if they'd survived. My guess is a significant number.[4] +After reading a draft, Sam Altman wrote:"I think you should make the hiring point more strongly. I think +it's roughly correct to say that YC's most successful companies +have never been the fastest to hire, and one of the marks of a great +founder is being able to resist this urge."Paul Buchheit adds:"A related problem that I see a lot is premature scaling—founders +take a small business that isn't really working (bad unit economics, +typically) and then scale it up because they want impressive growth +numbers. 
This is similar to over-hiring in that it makes the business +much harder to fix once it's big, plus they are bleeding cash really +fast." +Thanks to Sam Altman, Paul Buchheit, Joe Gebbia, Jessica Livingston, +and Geoff Ralston for reading drafts of this. \ No newline at end of file diff --git a/data/PaulGrahamEssaysLarge/apple.txt b/data/PaulGrahamEssaysLarge/apple.txt new file mode 100644 index 0000000..7a498d0 --- /dev/null +++ b/data/PaulGrahamEssaysLarge/apple.txt @@ -0,0 +1,201 @@ + + +Want to start a startup? Get funded by +Y Combinator. + + + + +November 2009I don't think Apple realizes how badly the App Store approval process +is broken. Or rather, I don't think they realize how much it matters +that it's broken.The way Apple runs the App Store has harmed their reputation with +programmers more than anything else they've ever done. +Their reputation with programmers used to be great. +It used to be the most common complaint you heard +about Apple was that their fans admired them too uncritically. +The App Store has changed that. Now a lot of programmers +have started to see Apple as evil.How much of the goodwill Apple once had with programmers have they +lost over the App Store? A third? Half? And that's just so far. +The App Store is an ongoing karma leak.* * *How did Apple get into this mess? Their fundamental problem is +that they don't understand software.They treat iPhone apps the way they treat the music they sell through +iTunes. Apple is the channel; they own the user; if you want to +reach users, you do it on their terms. The record labels agreed, +reluctantly. But this model doesn't work for software. It doesn't +work for an intermediary to own the user. The software business +learned that in the early 1980s, when companies like VisiCorp showed +that although the words "software" and "publisher" fit together, +the underlying concepts don't. Software isn't like music or books. 
+It's too complicated for a third party to act as an intermediary +between developer and user. And yet that's what Apple is trying +to be with the App Store: a software publisher. And a particularly +overreaching one at that, with fussy tastes and a rigidly enforced +house style.If software publishing didn't work in 1980, it works even less now +that software development has evolved from a small number of big +releases to a constant stream of small ones. But Apple doesn't +understand that either. Their model of product development derives +from hardware. They work on something till they think it's finished, +then they release it. You have to do that with hardware, but because +software is so easy to change, its design can benefit from evolution. +The standard way to develop applications now is to launch fast and +iterate. Which means it's a disaster to have long, random delays +each time you release a new version.Apparently Apple's attitude is that developers should be more careful +when they submit a new version to the App Store. They would say +that. But powerful as they are, they're not powerful enough to +turn back the evolution of technology. Programmers don't use +launch-fast-and-iterate out of laziness. They use it because it +yields the best results. By obstructing that process, Apple is +making them do bad work, and programmers hate that as much as Apple +would.How would Apple like it if when they discovered a serious bug in +OS X, instead of releasing a software update immediately, they had +to submit their code to an intermediary who sat on it for a month +and then rejected it because it contained an icon they didn't like?By breaking software development, Apple gets the opposite of what +they intended: the version of an app currently available in the App +Store tends to be an old and buggy one. One developer told me: + + As a result of their process, the App Store is full of half-baked + applications. 
I make a new version almost every day that I release + to beta users. The version on the App Store feels old and crappy. + I'm sure that a lot of developers feel this way: One emotion is + "I'm not really proud about what's in the App Store", and it's + combined with the emotion "Really, it's Apple's fault." + +Another wrote: + + I believe that they think their approval process helps users by + ensuring quality. In reality, bugs like ours get through all the + time and then it can take 4-8 weeks to get that bug fix approved, + leaving users to think that iPhone apps sometimes just don't work. + Worse for Apple, these apps work just fine on other platforms + that have immediate approval processes. + +Actually I suppose Apple has a third misconception: that all the +complaints about App Store approvals are not a serious problem. +They must hear developers complaining. But partners and suppliers +are always complaining. It would be a bad sign if they weren't; +it would mean you were being too easy on them. Meanwhile the iPhone +is selling better than ever. So why do they need to fix anything?They get away with maltreating developers, in the short term, because +they make such great hardware. I just bought a new 27" iMac a +couple days ago. It's fabulous. The screen's too shiny, and the +disk is surprisingly loud, but it's so beautiful that you can't +make yourself care.So I bought it, but I bought it, for the first time, with misgivings. +I felt the way I'd feel buying something made in a country with a +bad human rights record. That was new. In the past when I bought +things from Apple it was an unalloyed pleasure. Oh boy! They make +such great stuff. This time it felt like a Faustian bargain. They +make such great stuff, but they're such assholes. Do I really want +to support this company?* * *Should Apple care what people like me think? What difference does +it make if they alienate a small minority of their users?There are a couple reasons they should care. 
One is that these +users are the people they want as employees. If your company seems +evil, the best programmers won't work for you. That hurt Microsoft +a lot starting in the 90s. Programmers started to feel sheepish +about working there. It seemed like selling out. When people from +Microsoft were talking to other programmers and they mentioned where +they worked, there were a lot of self-deprecating jokes about having +gone over to the dark side. But the real problem for Microsoft +wasn't the embarrassment of the people they hired. It was the +people they never got. And you know who got them? Google and +Apple. If Microsoft was the Empire, they were the Rebel Alliance. +And it's largely because they got more of the best people that +Google and Apple are doing so much better than Microsoft today.Why are programmers so fussy about their employers' morals? Partly +because they can afford to be. The best programmers can work +wherever they want. They don't have to work for a company they +have qualms about.But the other reason programmers are fussy, I think, is that evil +begets stupidity. An organization that wins by exercising power +starts to lose the ability to win by doing better work. And it's +not fun for a smart person to work in a place where the best ideas +aren't the ones that win. I think the reason Google embraced "Don't +be evil" so eagerly was not so much to impress the outside world +as to inoculate themselves against arrogance. +[1]That has worked for Google so far. They've become more +bureaucratic, but otherwise they seem to have held true to their +original principles. With Apple that seems less the case. When you +look at the famous +1984 ad +now, it's easier to imagine Apple as the +dictator on the screen than the woman with the hammer. +[2] +In fact, if you read the dictator's speech it sounds uncannily like a +prophecy of the App Store. 
+ + We have triumphed over the unprincipled dissemination of facts.We have created, for the first time in all history, a garden of + pure ideology, where each worker may bloom secure from the pests + of contradictory and confusing truths. + +The other reason Apple should care what programmers think of them +is that when you sell a platform, developers make or break you. If +anyone should know this, Apple should. VisiCalc made the Apple II.And programmers build applications for the platforms they use. Most +applications—most startups, probably—grow out of personal projects. +Apple itself did. Apple made microcomputers because that's what +Steve Wozniak wanted for himself. He couldn't have afforded a +minicomputer. +[3] + Microsoft likewise started out making interpreters +for little microcomputers because +Bill Gates and Paul Allen were interested in using them. It's a +rare startup that doesn't build something the founders use.The main reason there are so many iPhone apps is that so many programmers +have iPhones. They may know, because they read it in an article, +that Blackberry has such and such market share. But in practice +it's as if RIM didn't exist. If they're going to build something, +they want to be able to use it themselves, and that means building +an iPhone app.So programmers continue to develop iPhone apps, even though Apple +continues to maltreat them. They're like someone stuck in an abusive +relationship. They're so attracted to the iPhone that they can't +leave. But they're looking for a way out. One wrote: + + While I did enjoy developing for the iPhone, the control they + place on the App Store does not give me the drive to develop + applications as I would like. In fact I don't intend to make any + more iPhone applications unless absolutely necessary. +[4] + +Can anything break this cycle? No device I've seen so far could. +Palm and RIM haven't a hope. The only credible contender is Android. 
+But Android is an orphan; Google doesn't really care about it, not +the way Apple cares about the iPhone. Apple cares about the iPhone +the way Google cares about search.* * *Is the future of handheld devices one locked down by Apple? It's +a worrying prospect. It would be a bummer to have another grim +monoculture like we had in the 1990s. In 1995, writing software +for end users was effectively identical with writing Windows +applications. Our horror at that prospect was the single biggest +thing that drove us to start building web apps.At least we know now what it would take to break Apple's lock. +You'd have to get iPhones out of programmers' hands. If programmers +used some other device for mobile web access, they'd start to develop +apps for that instead.How could you make a device programmers liked better than the iPhone? +It's unlikely you could make something better designed. Apple +leaves no room there. So this alternative device probably couldn't +win on general appeal. It would have to win by virtue of some +appeal it had to programmers specifically.One way to appeal to programmers is with software. If you +could think of an application programmers had to have, but that +would be impossible in the circumscribed world of the iPhone, +you could presumably get them to switch.That would definitely happen if programmers started to use handhelds +as development machines—if handhelds displaced laptops the +way laptops displaced desktops. You need more control of a development +machine than Apple will let you have over an iPhone.Could anyone make a device that you'd carry around in your pocket +like a phone, and yet would also work as a development machine? +It's hard to imagine what it would look like. But I've learned +never to say never about technology. 
A phone-sized device that +would work as a development machine is no more miraculous by present +standards than the iPhone itself would have seemed by the standards +of 1995.My current development machine is a MacBook Air, which I use with +an external monitor and keyboard in my office, and by itself when +traveling. If there was a version half the size I'd prefer it. +That still wouldn't be small enough to carry around everywhere like +a phone, but we're within a factor of 4 or so. Surely that gap is +bridgeable. In fact, let's make it an +RFS. Wanted: +Woman with hammer.Notes[1] +When Google adopted "Don't be evil," they were still so small +that no one would have expected them to be, yet. +[2] +The dictator in the 1984 ad isn't Microsoft, incidentally; +it's IBM. IBM seemed a lot more frightening in those days, but +they were friendlier to developers than Apple is now.[3] +He couldn't even afford a monitor. That's why the Apple +I used a TV as a monitor.[4] +Several people I talked to mentioned how much they liked the +iPhone SDK. The problem is not Apple's products but their policies. +Fortunately policies are software; Apple can change them instantly +if they want to. Handy that, isn't it?Thanks to Sam Altman, Trevor Blackwell, Ross Boucher, +James Bracy, Gabor Cselle, +Patrick Collison, Jason Freedman, John Gruber, Joe Hewitt, Jessica Livingston, +Robert Morris, Teng Siong Ong, Nikhil Pandit, Savraj Singh, and Jared Tame for reading drafts of this. \ No newline at end of file diff --git a/data/PaulGrahamEssaysLarge/avg.txt b/data/PaulGrahamEssaysLarge/avg.txt new file mode 100644 index 0000000..a3ec04d --- /dev/null +++ b/data/PaulGrahamEssaysLarge/avg.txt @@ -0,0 +1,375 @@ + + +Want to start a startup? Get funded by +Y Combinator. + + + + +April 2001, rev. April 2003(This article is derived from a talk given at the 2001 Franz +Developer Symposium.) +In the summer of 1995, my friend Robert Morris and I +started a startup called +Viaweb. 
+Our plan was to write +software that would let end users build online stores. +What was novel about this software, at the time, was +that it ran on our server, using ordinary Web pages +as the interface.A lot of people could have been having this idea at the +same time, of course, but as far as I know, Viaweb was +the first Web-based application. It seemed such +a novel idea to us that we named the company after it: +Viaweb, because our software worked via the Web, +instead of running on your desktop computer.Another unusual thing about this software was that it +was written primarily in a programming language called +Lisp. It was one of the first big end-user +applications to be written in Lisp, which up till then +had been used mostly in universities and research labs. [1]The Secret WeaponEric Raymond has written an essay called "How to Become a Hacker," +and in it, among other things, he tells would-be hackers what +languages they should learn. He suggests starting with Python and +Java, because they are easy to learn. The serious hacker will also +want to learn C, in order to hack Unix, and Perl for system +administration and cgi scripts. Finally, the truly serious hacker +should consider learning Lisp: + + Lisp is worth learning for the profound enlightenment experience + you will have when you finally get it; that experience will make + you a better programmer for the rest of your days, even if you + never actually use Lisp itself a lot. + +This is the same argument you tend to hear for learning Latin. It +won't get you a job, except perhaps as a classics professor, but +it will improve your mind, and make you a better writer in languages +you do want to use, like English.But wait a minute. This metaphor doesn't stretch that far. The +reason Latin won't get you a job is that no one speaks it. If you +write in Latin, no one can understand you. 
But Lisp is a computer +language, and computers speak whatever language you, the programmer, +tell them to.So if Lisp makes you a better programmer, like he says, why wouldn't +you want to use it? If a painter were offered a brush that would +make him a better painter, it seems to me that he would want to +use it in all his paintings, wouldn't he? I'm not trying to make +fun of Eric Raymond here. On the whole, his advice is good. What +he says about Lisp is pretty much the conventional wisdom. But +there is a contradiction in the conventional wisdom: Lisp will +make you a better programmer, and yet you won't use it.Why not? Programming languages are just tools, after all. If Lisp +really does yield better programs, you should use it. And if it +doesn't, then who needs it?This is not just a theoretical question. Software is a very +competitive business, prone to natural monopolies. A company that +gets software written faster and better will, all other things +being equal, put its competitors out of business. And when you're +starting a startup, you feel this very keenly. Startups tend to +be an all or nothing proposition. You either get rich, or you get +nothing. In a startup, if you bet on the wrong technology, your +competitors will crush you.Robert and I both knew Lisp well, and we couldn't see any reason +not to trust our instincts and go with Lisp. We knew that everyone +else was writing their software in C++ or Perl. But we also knew +that that didn't mean anything. If you chose technology that way, +you'd be running Windows. When you choose technology, you have to +ignore what other people are doing, and consider only what will +work the best.This is especially true in a startup. In a big company, you can +do what all the other big companies are doing. But a startup can't +do what all the other startups do. I don't think a lot of people +realize this, even in startups.The average big company grows at about ten percent a year. 
So if +you're running a big company and you do everything the way the +average big company does it, you can expect to do as well as the +average big company-- that is, to grow about ten percent a year.The same thing will happen if you're running a startup, of course. +If you do everything the way the average startup does it, you should +expect average performance. The problem here is, average performance +means that you'll go out of business. The survival rate for startups +is way less than fifty percent. So if you're running a startup, +you had better be doing something odd. If not, you're in trouble.Back in 1995, we knew something that I don't think our competitors +understood, and few understand even now: when you're writing +software that only has to run on your own servers, you can use +any language you want. When you're writing desktop software, +there's a strong bias toward writing applications in the same +language as the operating system. Ten years ago, writing applications +meant writing applications in C. But with Web-based software, +especially when you have the source code of both the language and +the operating system, you can use whatever language you want.This new freedom is a double-edged sword, however. Now that you +can use any language, you have to think about which one to use. +Companies that try to pretend nothing has changed risk finding that +their competitors do not.If you can use any language, which do you use? We chose Lisp. +For one thing, it was obvious that rapid development would be +important in this market. We were all starting from scratch, so +a company that could get new features done before its competitors +would have a big advantage. We knew Lisp was a really good language +for writing software quickly, and server-based applications magnify +the effect of rapid development, because you can release software +the minute it's done.If other companies didn't want to use Lisp, so much the better. 
+It might give us a technological edge, and we needed all the help +we could get. When we started Viaweb, we had no experience in +business. We didn't know anything about marketing, or hiring +people, or raising money, or getting customers. Neither of us had +ever even had what you would call a real job. The only thing we +were good at was writing software. We hoped that would save us. +Any advantage we could get in the software department, we would +take.So you could say that using Lisp was an experiment. Our hypothesis +was that if we wrote our software in Lisp, we'd be able to get +features done faster than our competitors, and also to do things +in our software that they couldn't do. And because Lisp was so +high-level, we wouldn't need a big development team, so our costs +would be lower. If this were so, we could offer a better product +for less money, and still make a profit. We would end up getting +all the users, and our competitors would get none, and eventually +go out of business. That was what we hoped would happen, anyway.What were the results of this experiment? Somewhat surprisingly, +it worked. We eventually had many competitors, on the order of +twenty to thirty of them, but none of their software could compete +with ours. We had a wysiwyg online store builder that ran on the +server and yet felt like a desktop application. Our competitors +had cgi scripts. And we were always far ahead of them in features. +Sometimes, in desperation, competitors would try to introduce +features that we didn't have. But with Lisp our development cycle +was so fast that we could sometimes duplicate a new feature within +a day or two of a competitor announcing it in a press release. By +the time journalists covering the press release got round to calling +us, we would have the new feature too.It must have seemed to our competitors that we had some kind of +secret weapon-- that we were decoding their Enigma traffic or +something. 
In fact we did have a secret weapon, but it was simpler +than they realized. No one was leaking news of their features to +us. We were just able to develop software faster than anyone +thought possible.When I was about nine I happened to get hold of a copy of The Day +of the Jackal, by Frederick Forsyth. The main character is an +assassin who is hired to kill the president of France. The assassin +has to get past the police to get up to an apartment that overlooks +the president's route. He walks right by them, dressed up as an +old man on crutches, and they never suspect him.Our secret weapon was similar. We wrote our software in a weird +AI language, with a bizarre syntax full of parentheses. For years +it had annoyed me to hear Lisp described that way. But now it +worked to our advantage. In business, there is nothing more valuable +than a technical advantage your competitors don't understand. In +business, as in war, surprise is worth as much as force.And so, I'm a little embarrassed to say, I never said anything +publicly about Lisp while we were working on Viaweb. We never +mentioned it to the press, and if you searched for Lisp on our Web +site, all you'd find were the titles of two books in my bio. This +was no accident. A startup should give its competitors as little +information as possible. If they didn't know what language our +software was written in, or didn't care, I wanted to keep it that +way.[2]The people who understood our technology best were the customers. +They didn't care what language Viaweb was written in either, but +they noticed that it worked really well. It let them build great +looking online stores literally in minutes. And so, by word of +mouth mostly, we got more and more users. By the end of 1996 we +had about 70 stores online. At the end of 1997 we had 500. Six +months later, when Yahoo bought us, we had 1070 users. Today, as +Yahoo Store, this software continues to dominate its market. 
It's +one of the more profitable pieces of Yahoo, and the stores built +with it are the foundation of Yahoo Shopping. I left Yahoo in +1999, so I don't know exactly how many users they have now, but +the last I heard there were about 20,000. +The Blub ParadoxWhat's so great about Lisp? And if Lisp is so great, why doesn't +everyone use it? These sound like rhetorical questions, but actually +they have straightforward answers. Lisp is so great not because +of some magic quality visible only to devotees, but because it is +simply the most powerful language available. And the reason everyone +doesn't use it is that programming languages are not merely +technologies, but habits of mind as well, and nothing changes +slower. Of course, both these answers need explaining.I'll begin with a shockingly controversial statement: programming +languages vary in power.Few would dispute, at least, that high level languages are more +powerful than machine language. Most programmers today would agree +that you do not, ordinarily, want to program in machine language. +Instead, you should program in a high-level language, and have a +compiler translate it into machine language for you. This idea is +even built into the hardware now: since the 1980s, instruction sets +have been designed for compilers rather than human programmers.Everyone knows it's a mistake to write your whole program by hand +in machine language. What's less often understood is that there +is a more general principle here: that if you have a choice of +several languages, it is, all other things being equal, a mistake +to program in anything but the most powerful one. [3]There are many exceptions to this rule. If you're writing a program +that has to work very closely with a program written in a certain +language, it might be a good idea to write the new program in the +same language. 
If you're writing a program that only has to do +something very simple, like number crunching or bit manipulation, +you may as well use a less abstract language, especially since it +may be slightly faster. And if you're writing a short, throwaway +program, you may be better off just using whatever language has +the best library functions for the task. But in general, for +application software, you want to be using the most powerful +(reasonably efficient) language you can get, and using anything +else is a mistake, of exactly the same kind, though possibly in a +lesser degree, as programming in machine language.You can see that machine language is very low level. But, at least +as a kind of social convention, high-level languages are often all +treated as equivalent. They're not. Technically the term "high-level +language" doesn't mean anything very definite. There's no dividing +line with machine languages on one side and all the high-level +languages on the other. Languages fall along a continuum [4] of +abstractness, from the most powerful all the way down to machine +languages, which themselves vary in power.Consider Cobol. Cobol is a high-level language, in the sense that +it gets compiled into machine language. Would anyone seriously +argue that Cobol is equivalent in power to, say, Python? It's +probably closer to machine language than Python.Or how about Perl 4? Between Perl 4 and Perl 5, lexical closures +got added to the language. Most Perl hackers would agree that Perl +5 is more powerful than Perl 4. But once you've admitted that, +you've admitted that one high level language can be more powerful +than another. And it follows inexorably that, except in special +cases, you ought to use the most powerful you can get.This idea is rarely followed to its conclusion, though. After a +certain age, programmers rarely switch languages voluntarily. 
+Whatever language people happen to be used to, they tend to consider +just good enough.Programmers get very attached to their favorite languages, and I +don't want to hurt anyone's feelings, so to explain this point I'm +going to use a hypothetical language called Blub. Blub falls right +in the middle of the abstractness continuum. It is not the most +powerful language, but it is more powerful than Cobol or machine +language.And in fact, our hypothetical Blub programmer wouldn't use either +of them. Of course he wouldn't program in machine language. That's +what compilers are for. And as for Cobol, he doesn't know how +anyone can get anything done with it. It doesn't even have x (Blub +feature of your choice).As long as our hypothetical Blub programmer is looking down the +power continuum, he knows he's looking down. Languages less powerful +than Blub are obviously less powerful, because they're missing some +feature he's used to. But when our hypothetical Blub programmer +looks in the other direction, up the power continuum, he doesn't +realize he's looking up. What he sees are merely weird languages. +He probably considers them about equivalent in power to Blub, but +with all this other hairy stuff thrown in as well. Blub is good +enough for him, because he thinks in Blub.When we switch to the point of view of a programmer using any of +the languages higher up the power continuum, however, we find that +he in turn looks down upon Blub. How can you get anything done in +Blub? It doesn't even have y.By induction, the only programmers in a position to see all the +differences in power between the various languages are those who +understand the most powerful one. (This is probably what Eric +Raymond meant about Lisp making you a better programmer.) 
You can't +trust the opinions of the others, because of the Blub paradox: +they're satisfied with whatever language they happen to use, because +it dictates the way they think about programs.I know this from my own experience, as a high school kid writing +programs in Basic. That language didn't even support recursion. +It's hard to imagine writing programs without using recursion, but +I didn't miss it at the time. I thought in Basic. And I was a +whiz at it. Master of all I surveyed.The five languages that Eric Raymond recommends to hackers fall at +various points on the power continuum. Where they fall relative +to one another is a sensitive topic. What I will say is that I +think Lisp is at the top. And to support this claim I'll tell you +about one of the things I find missing when I look at the other +four languages. How can you get anything done in them, I think, +without macros? [5]Many languages have something called a macro. But Lisp macros are +unique. And believe it or not, what they do is related to the +parentheses. The designers of Lisp didn't put all those parentheses +in the language just to be different. To the Blub programmer, Lisp +code looks weird. But those parentheses are there for a reason. +They are the outward evidence of a fundamental difference between +Lisp and other languages.Lisp code is made out of Lisp data objects. And not in the trivial +sense that the source files contain characters, and strings are +one of the data types supported by the language. Lisp code, after +it's read by the parser, is made of data structures that you can +traverse.If you understand how compilers work, what's really going on is +not so much that Lisp has a strange syntax as that Lisp has no +syntax. You write programs in the parse trees that get generated +within the compiler when other languages are parsed. But these +parse trees are fully accessible to your programs. You can write +programs that manipulate them. 
In Lisp, these programs are called +macros. They are programs that write programs.Programs that write programs? When would you ever want to do that? +Not very often, if you think in Cobol. All the time, if you think +in Lisp. It would be convenient here if I could give an example +of a powerful macro, and say there! how about that? But if I did, +it would just look like gibberish to someone who didn't know Lisp; +there isn't room here to explain everything you'd need to know to +understand what it meant. In +Ansi Common Lisp I tried to move +things along as fast as I could, and even so I didn't get to macros +until page 160.But I think I can give a kind of argument that might be convincing. +The source code of the Viaweb editor was probably about 20-25% +macros. Macros are harder to write than ordinary Lisp functions, +and it's considered to be bad style to use them when they're not +necessary. So every macro in that code is there because it has to +be. What that means is that at least 20-25% of the code in this +program is doing things that you can't easily do in any other +language. However skeptical the Blub programmer might be about my +claims for the mysterious powers of Lisp, this ought to make him +curious. We weren't writing this code for our own amusement. We +were a tiny startup, programming as hard as we could in order to +put technical barriers between us and our competitors.A suspicious person might begin to wonder if there was some +correlation here. A big chunk of our code was doing things that +are very hard to do in other languages. The resulting software +did things our competitors' software couldn't do. Maybe there was +some kind of connection. I encourage you to follow that thread. +There may be more to that old man hobbling along on his crutches +than meets the eye.Aikido for StartupsBut I don't expect to convince anyone +(over 25) +to go out and learn +Lisp. 
The purpose of this article is not to change anyone's mind, +but to reassure people already interested in using Lisp-- people +who know that Lisp is a powerful language, but worry because it +isn't widely used. In a competitive situation, that's an advantage. +Lisp's power is multiplied by the fact that your competitors don't +get it.If you think of using Lisp in a startup, you shouldn't worry that +it isn't widely understood. You should hope that it stays that +way. And it's likely to. It's the nature of programming languages +to make most people satisfied with whatever they currently use. +Computer hardware changes so much faster than personal habits that +programming practice is usually ten to twenty years behind the +processor. At places like MIT they were writing programs in +high-level languages in the early 1960s, but many companies continued +to write code in machine language well into the 1980s. I bet a +lot of people continued to write machine language until the processor, +like a bartender eager to close up and go home, finally kicked them +out by switching to a risc instruction set.Ordinarily technology changes fast. But programming languages are +different: programming languages are not just technology, but what +programmers think in. They're half technology and half religion.[6] +And so the median language, meaning whatever language the median +programmer uses, moves as slow as an iceberg. Garbage collection, +introduced by Lisp in about 1960, is now widely considered to be +a good thing. Runtime typing, ditto, is growing in popularity. +Lexical closures, introduced by Lisp in the early 1970s, are now, +just barely, on the radar screen. Macros, introduced by Lisp in the +mid 1960s, are still terra incognita.Obviously, the median language has enormous momentum. I'm not +proposing that you can fight this powerful force. 
What I'm proposing +is exactly the opposite: that, like a practitioner of Aikido, you +can use it against your opponents.If you work for a big company, this may not be easy. You will have +a hard time convincing the pointy-haired boss to let you build +things in Lisp, when he has just read in the paper that some other +language is poised, like Ada was twenty years ago, to take over +the world. But if you work for a startup that doesn't have +pointy-haired bosses yet, you can, like we did, turn the Blub +paradox to your advantage: you can use technology that your +competitors, glued immovably to the median language, will never be +able to match.If you ever do find yourself working for a startup, here's a handy +tip for evaluating competitors. Read their job listings. Everything +else on their site may be stock photos or the prose equivalent, +but the job listings have to be specific about what they want, or +they'll get the wrong candidates.During the years we worked on Viaweb I read a lot of job descriptions. +A new competitor seemed to emerge out of the woodwork every month +or so. The first thing I would do, after checking to see if they +had a live online demo, was look at their job listings. After a +couple years of this I could tell which companies to worry about +and which not to. The more of an IT flavor the job descriptions +had, the less dangerous the company was. The safest kind were the +ones that wanted Oracle experience. You never had to worry about +those. You were also safe if they said they wanted C++ or Java +developers. If they wanted Perl or Python programmers, that would +be a bit frightening-- that's starting to sound like a company +where the technical side, at least, is run by real hackers. If I +had ever seen a job posting looking for Lisp hackers, I would have +been really worried. 
+Notes [1] Viaweb at first had two parts: the editor, written in Lisp, +which people used to build their sites, and the ordering system, +written in C, which handled orders. The first version was mostly +Lisp, because the ordering system was small. Later we added two +more modules, an image generator written in C, and a back-office +manager written mostly in Perl. In January 2003, Yahoo released a new version of the editor +written in C++ and Perl. It's hard to say whether the program is no +longer written in Lisp, though, because to translate this program +into C++ they literally had to write a Lisp interpreter: the source +files of all the page-generating templates are still, as far as I +know, Lisp code. (See Greenspun's Tenth Rule.) [2] Robert Morris says that I didn't need to be secretive, because +even if our competitors had known we were using Lisp, they wouldn't +have understood why: "If they were that smart they'd already be +programming in Lisp." [3] All languages are equally powerful in the sense of being Turing +equivalent, but that's not the sense of the word programmers care +about. (No one wants to program a Turing machine.) The kind of +power programmers care about may not be formally definable, but +one way to explain it would be to say that it refers to features +you could only get in the less powerful language by writing an +interpreter for the more powerful language in it. If language A +has an operator for removing spaces from strings and language B +doesn't, that probably doesn't make A more powerful, because you +can probably write a subroutine to do it in B. But if A supports, +say, recursion, and B doesn't, that's not likely to be something +you can fix by writing library functions. [4] Note to nerds: or possibly a lattice, narrowing toward the top; +it's not the shape that matters here but the idea that there is at +least a partial order. [5] It is a bit misleading to treat macros as a separate feature.
+In practice their usefulness is greatly enhanced by other Lisp +features like lexical closures and rest parameters. [6] As a result, comparisons of programming languages either take +the form of religious wars or undergraduate textbooks so determinedly +neutral that they're really works of anthropology. People who +value their peace, or want tenure, avoid the topic. But the question +is only half a religious one; there is something there worth +studying, especially if you want to design new languages. \ No newline at end of file diff --git a/data/PaulGrahamEssaysLarge/before.txt b/data/PaulGrahamEssaysLarge/before.txt new file mode 100644 index 0000000..9d0e393 --- /dev/null +++ b/data/PaulGrahamEssaysLarge/before.txt @@ -0,0 +1,387 @@ + + +Want to start a startup? Get funded by +Y Combinator. + + + + +October 2014 (This essay is derived from a guest lecture in Sam Altman's startup class at +Stanford. It's intended for college students, but much of it is +applicable to potential founders at other ages.) One of the advantages of having kids is that when you have to give +advice, you can ask yourself "what would I tell my own kids?" My +kids are little, but I can imagine what I'd tell them about startups +if they were in college, and that's what I'm going to tell you. Startups are very counterintuitive. I'm not sure why. Maybe it's +just because knowledge about them hasn't permeated our culture yet. +But whatever the reason, starting a startup is a task where you +can't always trust your instincts. It's like skiing in that way. When you first try skiing and you +want to slow down, your instinct is to lean back. But if you lean +back on skis you fly down the hill out of control. So part of +learning to ski is learning to suppress that impulse. Eventually +you get new habits, but at first it takes a conscious effort.
At +first there's a list of things you're trying to remember as you +start down the hill.Startups are as unnatural as skiing, so there's a similar list for +startups. Here I'm going to give you the first part of it — the things +to remember if you want to prepare yourself to start a startup. +CounterintuitiveThe first item on it is the fact I already mentioned: that startups +are so weird that if you trust your instincts, you'll make a lot +of mistakes. If you know nothing more than this, you may at least +pause before making them.When I was running Y Combinator I used to joke that our function +was to tell founders things they would ignore. It's really true. +Batch after batch, the YC partners warn founders about mistakes +they're about to make, and the founders ignore them, and then come +back a year later and say "I wish we'd listened."Why do the founders ignore the partners' advice? Well, that's the +thing about counterintuitive ideas: they contradict your intuitions. +They seem wrong. So of course your first impulse is to disregard +them. And in fact my joking description is not merely the curse +of Y Combinator but part of its raison d'etre. If founders' instincts +already gave them the right answers, they wouldn't need us. You +only need other people to give you advice that surprises you. That's +why there are a lot of ski instructors and not many running +instructors. +[1]You can, however, trust your instincts about people. And in fact +one of the most common mistakes young founders make is not to +do that enough. They get involved with people who seem impressive, +but about whom they feel some misgivings personally. Later when +things blow up they say "I knew there was something off about him, +but I ignored it because he seemed so impressive."If you're thinking about getting involved with someone — as a +cofounder, an employee, an investor, or an acquirer — and you +have misgivings about them, trust your gut. 
If someone seems +slippery, or bogus, or a jerk, don't ignore it.This is one case where it pays to be self-indulgent. Work with +people you genuinely like, and you've known long enough to be sure. +ExpertiseThe second counterintuitive point is that it's not that important +to know a lot about startups. The way to succeed in a startup is +not to be an expert on startups, but to be an expert on your users +and the problem you're solving for them. +Mark Zuckerberg didn't succeed because he was an expert on startups. +He succeeded despite being a complete noob at startups, because he +understood his users really well.If you don't know anything about, say, how to raise an angel round, +don't feel bad on that account. That sort of thing you can learn +when you need to, and forget after you've done it.In fact, I worry it's not merely unnecessary to learn in great +detail about the mechanics of startups, but possibly somewhat +dangerous. If I met an undergrad who knew all about convertible +notes and employee agreements and (God forbid) class FF stock, I +wouldn't think "here is someone who is way ahead of their peers." +It would set off alarms. Because another of the characteristic +mistakes of young founders is to go through the motions of starting +a startup. They make up some plausible-sounding idea, raise money +at a good valuation, rent a cool office, hire a bunch of people. +From the outside that seems like what startups do. But the next +step after rent a cool office and hire a bunch of people is: gradually +realize how completely fucked they are, because while imitating all +the outward forms of a startup they have neglected the one thing +that's actually essential: making something people want. +GameWe saw this happen so often that we made up a name for it: playing +house. Eventually I realized why it was happening. 
The reason +young founders go through the motions of starting a startup is +because that's what they've been trained to do for their whole lives +up to that point. Think about what you have to do to get into +college, for example. Extracurricular activities, check. Even in +college classes most of the work is as artificial as running laps.I'm not attacking the educational system for being this way. There +will always be a certain amount of fakeness in the work you do when +you're being taught something, and if you measure their performance +it's inevitable that people will exploit the difference to the point +where much of what you're measuring is artifacts of the fakeness.I confess I did it myself in college. I found that in a lot of +classes there might only be 20 or 30 ideas that were the right shape +to make good exam questions. The way I studied for exams in these +classes was not (except incidentally) to master the material taught +in the class, but to make a list of potential exam questions and +work out the answers in advance. When I walked into the final, the +main thing I'd be feeling was curiosity about which of my questions +would turn up on the exam. It was like a game.It's not surprising that after being trained for their whole lives +to play such games, young founders' first impulse on starting a +startup is to try to figure out the tricks for winning at this new +game. Since fundraising appears to be the measure of success for +startups (another classic noob mistake), they always want to know what the +tricks are for convincing investors. We tell them the best way to +convince investors is to make a startup +that's actually doing well, meaning growing fast, and then simply +tell investors so. Then they want to know what the tricks are for +growing fast. 
And we have to tell them the best way to do that is +simply to make something people want.So many of the conversations YC partners have with young founders +begin with the founder asking "How do we..." and the partner replying +"Just..."Why do the founders always make things so complicated? The reason, +I realized, is that they're looking for the trick.So this is the third counterintuitive thing to remember about +startups: starting a startup is where gaming the system stops +working. Gaming the system may continue to work if you go to work +for a big company. Depending on how broken the company is, you can +succeed by sucking up to the right people, giving the impression +of productivity, and so on. +[2] +But that doesn't work with startups. +There is no boss to trick, only users, and all users care about is +whether your product does what they want. Startups are as impersonal +as physics. You have to make something people want, and you prosper +only to the extent you do.The dangerous thing is, faking does work to some degree on investors. +If you're super good at sounding like you know what you're talking +about, you can fool investors for at least one and perhaps even two +rounds of funding. But it's not in your interest to. The company +is ultimately doomed. All you're doing is wasting your own time +riding it down.So stop looking for the trick. There are tricks in startups, as +there are in any domain, but they are an order of magnitude less +important than solving the real problem. A founder who knows nothing +about fundraising but has made something users love will have an +easier time raising money than one who knows every trick in the +book but has a flat usage graph. 
And more importantly, the founder +who has made something users love is the one who will go on to +succeed after raising the money.Though in a sense it's bad news in that you're deprived of one of +your most powerful weapons, I think it's exciting that gaming the +system stops working when you start a startup. It's exciting that +there even exist parts of the world where you win by doing good +work. Imagine how depressing the world would be if it were all +like school and big companies, where you either have to spend a lot +of time on bullshit things or lose to people who do. +[3] +I would +have been delighted if I'd realized in college that there were parts +of the real world where gaming the system mattered less than others, +and a few where it hardly mattered at all. But there are, and this +variation is one of the most important things to consider when +you're thinking about your future. How do you win in each type of +work, and what would you like to win by doing? +[4] +All-ConsumingThat brings us to our fourth counterintuitive point: startups are +all-consuming. If you start a startup, it will take over your life +to a degree you cannot imagine. And if your startup succeeds, it +will take over your life for a long time: for several years at the +very least, maybe for a decade, maybe for the rest of your working +life. So there is a real opportunity cost here.Larry Page may seem to have an enviable life, but there are aspects +of it that are unenviable. Basically at 25 he started running as +fast as he could and it must seem to him that he hasn't stopped to +catch his breath since. Every day new shit happens in the Google +empire that only the CEO can deal with, and he, as CEO, has to deal +with it. If he goes on vacation for even a week, a whole week's +backlog of shit accumulates. 
And he has to bear this uncomplainingly, +partly because as the company's daddy he can never show fear or +weakness, and partly because billionaires get less than zero sympathy +if they talk about having difficult lives. Which has the strange +side effect that the difficulty of being a successful startup founder +is concealed from almost everyone except those who've done it.Y Combinator has now funded several companies that can be called +big successes, and in every single case the founders say the same +thing. It never gets any easier. The nature of the problems change. +You're worrying about construction delays at your London office +instead of the broken air conditioner in your studio apartment. +But the total volume of worry never decreases; if anything it +increases.Starting a successful startup is similar to having kids in that +it's like a button you push that changes your life irrevocably. +And while it's truly wonderful having kids, there are a lot of +things that are easier to do before you have them than after. Many +of which will make you a better parent when you do have kids. And +since you can delay pushing the button for a while, most people in +rich countries do.Yet when it comes to startups, a lot of people seem to think they're +supposed to start them while they're still in college. Are you +crazy? And what are the universities thinking? They go out of +their way to ensure their students are well supplied with contraceptives, +and yet they're setting up entrepreneurship programs and startup +incubators left and right.To be fair, the universities have their hand forced here. A lot +of incoming students are interested in startups. Universities are, +at least de facto, expected to prepare them for their careers. So +students who want to start startups hope universities can teach +them about startups. 
And whether universities can do this or not, +there's some pressure to claim they can, lest they lose applicants +to other universities that do.Can universities teach students about startups? Yes and no. They +can teach students about startups, but as I explained before, this +is not what you need to know. What you need to learn about are the +needs of your own users, and you can't do that until you actually +start the company. +[5] +So starting a startup is intrinsically +something you can only really learn by doing it. And it's impossible +to do that in college, for the reason I just explained: startups +take over your life. You can't start a startup for real as a +student, because if you start a startup for real you're not a student +anymore. You may be nominally a student for a bit, but you won't even +be that for long. +[6]Given this dichotomy, which of the two paths should you take? Be +a real student and not start a startup, or start a real startup and +not be a student? I can answer that one for you. Do not start a +startup in college. How to start a startup is just a subset of a +bigger problem you're trying to solve: how to have a good life. +And though starting a startup can be part of a good life for a lot +of ambitious people, age 20 is not the optimal time to do it. +Starting a startup is like a brutally fast depth-first search. Most +people should still be searching breadth-first at 20.You can do things in your early 20s that you can't do as well before +or after, like plunge deeply into projects on a whim and travel +super cheaply with no sense of a deadline. For unambitious people, +this sort of thing is the dreaded "failure to launch," but for the +ambitious ones it can be an incomparably valuable sort of exploration. +If you start a startup at 20 and you're sufficiently successful, +you'll never get to do it. +[7]Mark Zuckerberg will never get to bum around a foreign country. 
He +can do other things most people can't, like charter jets to fly him +to foreign countries. But success has taken a lot of the serendipity +out of his life. Facebook is running him as much as he's running +Facebook. And while it can be very cool to be in the grip of a +project you consider your life's work, there are advantages to +serendipity too, especially early in life. Among other things it +gives you more options to choose your life's work from.There's not even a tradeoff here. You're not sacrificing anything +if you forgo starting a startup at 20, because you're more likely +to succeed if you wait. In the unlikely case that you're 20 and +one of your side projects takes off like Facebook did, you'll face +a choice of running with it or not, and it may be reasonable to run +with it. But the usual way startups take off is for the founders +to make them take off, and it's gratuitously +stupid to do that at 20. +TryShould you do it at any age? I realize I've made startups sound +pretty hard. If I haven't, let me try again: starting a startup +is really hard. What if it's too hard? How can you tell if you're +up to this challenge?The answer is the fifth counterintuitive point: you can't tell. Your +life so far may have given you some idea what your prospects might +be if you tried to become a mathematician, or a professional football +player. But unless you've had a very strange life you haven't done +much that was like being a startup founder. +Starting a startup will change you a lot. So what you're trying +to estimate is not just what you are, but what you could grow into, +and who can do that?For the past 9 years it was my job to predict whether people would +have what it took to start successful startups. It was easy to +tell how smart they were, and most people reading this will be over +that threshold. The hard part was predicting how tough and ambitious they would become. 
There +may be no one who has more experience at trying to predict that, +so I can tell you how much an expert can know about it, and the +answer is: not much. I learned to keep a completely open mind about +which of the startups in each batch would turn out to be the stars.The founders sometimes think they know. Some arrive feeling sure +they will ace Y Combinator just as they've aced every one of the (few, +artificial, easy) tests they've faced in life so far. Others arrive +wondering how they got in, and hoping YC doesn't discover whatever +mistake caused it to accept them. But there is little correlation +between founders' initial attitudes and how well their companies +do.I've read that the same is true in the military — that the +swaggering recruits are no more likely to turn out to be really +tough than the quiet ones. And probably for the same reason: that +the tests involved are so different from the ones in their previous +lives.If you're absolutely terrified of starting a startup, you probably +shouldn't do it. But if you're merely unsure whether you're up to +it, the only way to find out is to try. Just not now. +IdeasSo if you want to start a startup one day, what should you do in +college? There are only two things you need initially: an idea and +cofounders. And the m.o. for getting both is the same. Which leads +to our sixth and last counterintuitive point: that the way to get +startup ideas is not to try to think of startup ideas.I've written a whole essay on this, +so I won't repeat it all here. But the short version is that if +you make a conscious effort to think of startup ideas, the ideas +you come up with will not merely be bad, but bad and plausible-sounding, +meaning you'll waste a lot of time on them before realizing they're +bad.The way to come up with good startup ideas is to take a step back. 
+Instead of making a conscious effort to think of startup ideas, +turn your mind into the type that startup ideas form in without any +conscious effort. In fact, so unconsciously that you don't even +realize at first that they're startup ideas.This is not only possible, it's how Apple, Yahoo, Google, and +Facebook all got started. None of these companies were even meant +to be companies at first. They were all just side projects. The +best startups almost have to start as side projects, because great +ideas tend to be such outliers that your conscious mind would reject +them as ideas for companies.Ok, so how do you turn your mind into the type that startup ideas +form in unconsciously? (1) Learn a lot about things that matter, +then (2) work on problems that interest you (3) with people you +like and respect. The third part, incidentally, is how you get +cofounders at the same time as the idea.The first time I wrote that paragraph, instead of "learn a lot about +things that matter," I wrote "become good at some technology." But +that prescription, though sufficient, is too narrow. What was +special about Brian Chesky and Joe Gebbia was not that they were +experts in technology. They were good at design, and perhaps even +more importantly, they were good at organizing groups and making +projects happen. So you don't have to work on technology per se, +so long as you work on problems demanding enough to stretch you.What kind of problems are those? That is very hard to answer in +the general case. History is full of examples of young people who +were working on important problems that no +one else at the time thought were important, and in particular +that their parents didn't think were important. On the other hand, +history is even fuller of examples of parents who thought their +kids were wasting their time and who were right. So how do you +know when you're working on real stuff? +[8]I know how I know. 
Real problems are interesting, and I am +self-indulgent in the sense that I always want to work on interesting +things, even if no one else cares about them (in fact, especially +if no one else cares about them), and find it very hard to make +myself work on boring things, even if they're supposed to be +important.My life is full of case after case where I worked on something just +because it seemed interesting, and it turned out later to be useful +in some worldly way. Y +Combinator itself was something I only did because it seemed +interesting. So I seem to have some sort of internal compass that +helps me out. But I don't know what other people have in their +heads. Maybe if I think more about this I can come up with heuristics +for recognizing genuinely interesting problems, but for the moment +the best I can offer is the hopelessly question-begging advice that +if you have a taste for genuinely interesting problems, indulging +it energetically is the best way to prepare yourself for a startup. +And indeed, probably also the best way to live. +[9]But although I can't explain in the general case what counts as an +interesting problem, I can tell you about a large subset of them. +If you think of technology as something that's spreading like a +sort of fractal stain, every moving point on the edge represents +an interesting problem. So one guaranteed way to turn your mind +into the type that has good startup ideas is to get yourself to the +leading edge of some technology — to cause yourself, as Paul +Buchheit put it, to "live in the future." When you reach that point, +ideas that will seem to other people uncannily prescient will seem +obvious to you. You may not realize they're startup ideas, but +you'll know they're something that ought to exist.For example, back at Harvard in the mid 90s a fellow grad student +of my friends Robert and Trevor wrote his own voice over IP software. +He didn't mean it to be a startup, and he never tried to turn it +into one. 
He just wanted to talk to his girlfriend in Taiwan without +paying for long distance calls, and since he was an expert on +networks it seemed obvious to him that the way to do it was turn +the sound into packets and ship it over the Internet. He never did +any more with his software than talk to his girlfriend, but this +is exactly the way the best startups get started.So strangely enough the optimal thing to do in college if you want +to be a successful startup founder is not some sort of new, vocational +version of college focused on "entrepreneurship." It's the classic +version of college as education for its own sake. If you want to +start a startup after college, what you should do in college is +learn powerful things. And if you have genuine intellectual +curiosity, that's what you'll naturally tend to do if you just +follow your own inclinations. +[10]The component of entrepreneurship that really matters is domain +expertise. The way to become Larry Page was to become an expert +on search. And the way to become an expert on search was to be +driven by genuine curiosity, not some ulterior motive.At its best, starting a startup is merely an ulterior motive for +curiosity. And you'll do it best if you introduce the ulterior +motive toward the end of the process.So here is the ultimate advice for young would-be startup founders, +boiled down to two words: just learn. +Notes[1] +Some founders listen more than others, and this tends to be a +predictor of success. One of the things I +remember about the Airbnbs during YC is how intently they listened.[2] +In fact, this is one of the reasons startups are possible. If +big companies weren't plagued by internal inefficiencies, they'd +be proportionately more effective, leaving less room for startups.[3] +In a startup you have to spend a lot of time on schleps, but this sort of work is merely +unglamorous, not bogus.[4] +What should you do if your true calling is gaming the system? 
+Management consulting.[5] +The company may not be incorporated, but if you start to get +significant numbers of users, you've started it, whether you realize +it yet or not.[6] +It shouldn't be that surprising that colleges can't teach +students how to be good startup founders, because they can't teach +them how to be good employees either.The way universities "teach" students how to be employees is to +hand off the task to companies via internship programs. But you +couldn't do the equivalent thing for startups, because by definition +if the students did well they would never come back.[7] +Charles Darwin was 22 when he received an invitation to travel +aboard the HMS Beagle as a naturalist. It was only because he was +otherwise unoccupied, to a degree that alarmed his family, that he +could accept it. And yet if he hadn't we probably would not know +his name.[8] +Parents can sometimes be especially conservative in this +department. There are some whose definition of important problems +includes only those on the critical path to med school.[9] +I did manage to think of a heuristic for detecting whether you +have a taste for interesting ideas: whether you find known boring +ideas intolerable. Could you endure studying literary theory, or +working in middle management at a large company?[10] +In fact, if your goal is to start a startup, you can stick +even more closely to the ideal of a liberal education than past +generations have. Back when students focused mainly on getting a +job after college, they thought at least a little about how the +courses they took might look to an employer. And perhaps even +worse, they might shy away from taking a difficult class lest they +get a low grade, which would harm their all-important GPA. Good +news: users don't care what your GPA +was. And I've never heard of investors caring either. Y Combinator +certainly never asks what classes you took in college or what grades +you got in them. 
+Thanks to Sam Altman, Paul Buchheit, John Collison, Patrick +Collison, Jessica Livingston, Robert Morris, Geoff Ralston, and +Fred Wilson for reading drafts of this. \ No newline at end of file diff --git a/docs/prompt b/docs/prompt index 30404ce..6f133cd 100644 --- a/docs/prompt +++ b/docs/prompt @@ -1 +1,222 @@ -TODO \ No newline at end of file +# Example App for text summarization & QA using minillmflow +from minillmflow import Node, BatchNode, Flow, BatchFlow, AsyncNode, AsyncFlow, BatchAsyncFlow +import os + +# 1) Implement a simple LLM helper (OpenAI in this example). +def call_LLM(prompt): + # Users must set an OpenAI API key; can also load from env var, etc. + openai.api_key = "YOUR_API_KEY_HERE" + r = openai.ChatCompletion.create( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] + ) + return r.choices[0].message.content + +# 2) Create a shared store (dict) for Node/Flow data exchange. +# This can be replaced with a DB or other storage. +# Design the structure / schema based on the app requirements. +shared = {"data": {}, "summary": {}} + +# 3) Create a Node that loads data from disk into shared['data']. +class LoadData(Node): + # For compute-intensive operations, do them in prep(). + def prep(self, shared): + path = "../data/PaulGrahamEssaysLarge" + for filename in os.listdir(path): + with open(os.path.join(path, filename), 'r') as f: + shared['data'][filename] = f.read() + # If LLM was needed, we'd handle it in exec(). Not needed here. + # (idempotent so it can be retried if needed) + def exec(self,shared,prep_res): pass + # post() can update shared again or decide the next node (by return the action). + def post(self,shared,prep_res,exec_res): pass + +load_data = LoadData() +# Run the data-loading node once +load_data.run(shared) + +# 4) Create a Node that summarizes a single file using the LLM. +class SummarizeFile(Node): + def prep(self, shared): + # Use self.params (which must remain immutable during prep/exec/post). 
+ # Typically, we only store identifying info in params (e.g., filename). + content = shared['data'][self.params['filename']] + return content + def exec(self, shared, prep_res): + content = prep_res + prompt = f"{content} Respond a summary of above in 10 words" + summary = call_llm(prompt) + return summary + def post(self, shared, prep_res, exec_res): + shared["summary"][self.params['filename']] = exec_res + +summarize_file = SummarizeFile() +# For testing, we set params directly on the node. +# In real usage, you'd set them in a Flow or BatchFlow. +summarize_file.set_params({"filename":"addiction.txt"}) +summarize_file.run(shared) + +# 5) If data is large, we can apply a map-reduce pattern: +# - MapSummaries(BatchNode) => chunk the file and summarize each chunk +# - ReduceSummaries(Node) => combine those chunk-level summaries +class MapSummaries(BatchNode): + def prep(self, shared): + content = shared['data'][self.params['filename']] + chunk_size = 10000 + chunks = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)] + # Must return an iterable (list or generator) for a BatchNode. + return chunks + def exec(self, shared, prep_res): + # Each iteration of prep_res corresponds to a single chunk. + chunk = prep_res + prompt = f"{chunk} Respond a summary of above in 10 words" + summary = call_llm(prompt) + return summary + def post(self, shared, prep_res, exec_res): + # exec_res is a list of exec() results (summaries for each chunk). + combined_summary = [f"{i}. 
{summary}" for i, summary in enumerate(exec_res)] + shared["summary"][self.params['filename']] = combined_summary + +class ReduceSummaries(Node): + def prep(self, shared): + # Retrieve the list of chunk summaries from shared storage + return shared["summary"][self.params['filename']] + def exec(self, shared, prep_res): + combined_summary = prep_res + prompt = f"{combined_summary} Respond a summary of above in 10 words" + summary = call_llm(prompt) + return summary + def post(self, shared, prep_res, exec_res): + # Store the combined summary as the final summary for this file. + shared["summary"][self.params['filename']] = exec_res + +map_summaries = MapSummaries() +reduce_summaries = ReduceSummaries() +# Link map_summaries to reduce_summaries with an action +# By default, the action is "default" (when post returns None, it takes "default" action) +# This is the same as map_summaries - "default" >> reduce_summaries +map_summaries >> reduce_summaries + +# We don't directly call map_summaries.run(shared), +# because that alone would process only the map step without reduce. + +# 6) Instead, create a Flow that starts from map_summaries (a Node) +# and automatically includes reduce_summaries. +# Note: A Flow can also start from any other Flow or BatchFlow. + + +file_summary_flow = Flow(start=map_summaries) +# When a flow's params are set, they are applied recursively to all nodes in the flow +file_summary_flow.set_params({"filename":"before.txt"}) +file_summary_flow.run(shared) + +# 7) Summarize all files using a BatchFlow that reruns file_summary_flow for each file +class SummarizeAllFiles(BatchFlow): + def prep(self, shared): + # Return a list of parameters to apply in each flow iteration. + # Each individual param will be merged with this node's own params + # Allowing nesting of multi-level BatchFlow. + # E.g., first level directory, second level file.
+ return [{"filename":filename} for filename in shared['data']] + +summarize_all_files = SummarizeAllFiles(start=file_summary_flow) +summarize_all_files.run(shared) + + +# 8) QA Agent: Find the most relevant file based on summary with actions +# if no question is asked: +# (a) end: terminate the flow +# if question is asked: +# if relevant file is found: +# (b) answer: move to answer node and read the whole file to answer the question +# if no relevant file is found: +# (c) retry: retry the process to find the relevant file +class FindRelevantFile(Node): + def prep(self, shared): + question = input("Enter a question: ") + formatted_list = [f"- '{filename}': {shared['summary'][filename]}" + for filename in shared['summary']] + return question, formatted_list + def exec(self, shared, prep_res): + question, formatted_list = prep_res + if not question: + return {"think":"no question", "has_relevant":False} + # Provide a structured YAML output that includes: + # - The chain of thought + # - Whether any relevant file was found + # - The most relevant file if found + prompt = f"""Question: {question} +Find the most relevant file from: +{formatted_list} +If no relevant file, explain why +Respond in yaml without additional information: +think: the question has/has no relevant file ... 
+has_relevant: true/false +most_relevant: filename""" + response = call_llm(prompt) + import yaml + result = yaml.safe_load(response) + # Ensure required fields are present + assert "think" in result + assert "has_relevant" in result + assert "most_relevant" in result if result["has_relevant"] else True + return result + # handle errors by returning a default response in case of exception after retries + def process_after_fail(self,shared,prep_res,exc): + # if not overridden, the default is to throw the exception + return {"think":"error finding the file", "has_relevant":False} + def post(self, shared, prep_res, exec_res): + question, _ = prep_res + # Decide what to do next based on the results + if not question: + print(f"No question asked") + return "end" + if exec_res["has_relevant"]: + # Store the question and most relevant file in shared + shared["question"] = question + shared["relevant_file"] = exec_res['most_relevant'] + print(f"Relevant file found: {exec_res['most_relevant']}") + return "answer" + else: + print(f"No relevant file found: {exec_res['think']}") + return "retry" + +class AnswerQuestion(Node): + def prep(self, shared): + question = shared['question'] + relevant_file = shared['relevant_file'] + # Read the whole file content + file_content = shared['data'][relevant_file] + return question, file_content + def exec(self, shared, prep_res): + question, file_content = prep_res + prompt = f"""Question: {question} +File: {file_content} +Answer the question in 50 words""" + response = call_llm(prompt) + return response + def post(self, shared, prep_res, exec_res): + print(f"Answer: {exec_res}") + +class NoOp(Node): + pass + +# Configure the QA agent with appropriate transitions and retries +find_relevant_file = FindRelevantFile(max_retries=3) +answer_question = AnswerQuestion() +no_op = NoOp() + +# Connect the nodes based on the actions they return +find_relevant_file - "answer" >> answer_question >> find_relevant_file +find_relevant_file - "retry" >> 
find_relevant_file +find_relevant_file - "end" >> no_op + +qa_agent = Flow(start=find_relevant_file) +qa_agent.run(shared) + + +# Above example demonstrates the use of minillmflow +# Next, build another app based on the same principles +# First, given the app's requirements, design the Node/Flow structure +# Then, design the data structure within shared storage, and how it's updated +# Finally, implement the Nodes and Flows to achieve the desired functionality \ No newline at end of file diff --git a/minillmflow/__init__.py b/minillmflow/__init__.py index 96305bc..9a5555b 100644 --- a/minillmflow/__init__.py +++ b/minillmflow/__init__.py @@ -3,9 +3,9 @@ import asyncio, warnings class BaseNode: def __init__(self): self.params,self.successors={},{} def set_params(self,params): self.params=params - def add_successor(self,node,cond="default"): - if cond in self.successors: warnings.warn(f"Overwriting successor for condition '{cond}'") - self.successors[cond]=node;return node + def add_successor(self,node,action="default"): + if action in self.successors: warnings.warn(f"Overwriting successor for action '{action}'") + self.successors[action]=node;return node def prep(self,shared): return None def exec(self,shared,prep_res): return None def _exec(self,shared,prep_res): return self.exec(shared,prep_res) @@ -18,13 +18,13 @@ class BaseNode: if self.successors: warnings.warn("Node won't run successors. 
Use a parent Flow instead.") return self._run(shared) def __rshift__(self,other): return self.add_successor(other) - def __sub__(self,cond): - if isinstance(cond,str): return _ConditionalTransition(self,cond) - raise TypeError("Condition must be a string") + def __sub__(self,action): + if isinstance(action,str): return _ConditionalTransition(self,action) + raise TypeError("Action must be a string") class _ConditionalTransition: - def __init__(self,src,cond): self.src,self.cond=src,cond - def __rshift__(self,tgt): return self.src.add_successor(tgt,self.cond) + def __init__(self,src,action): self.src,self.action=src,action + def __rshift__(self,tgt): return self.src.add_successor(tgt,self.action) class Node(BaseNode): def __init__(self,max_retries=1): @@ -42,16 +42,16 @@ class BatchNode(Node): def _exec(self,shared,items): return [super(Node,self)._exec(shared,i) for i in items] class Flow(BaseNode): - def __init__(self,start_node): + def __init__(self,start): super().__init__() - self.start_node=start_node - def get_next_node(self,curr,cond): - nxt=curr.successors.get(cond if cond is not None else "default") + self.start=start + def get_next_node(self,curr,action): + nxt=curr.successors.get(action if action is not None else "default") if not nxt and curr.successors: - warnings.warn(f"Flow ends: condition '{cond}' not found in {list(curr.successors)}") + warnings.warn(f"Flow ends: action '{action}' not found in {list(curr.successors)}") return nxt def _exec(self,shared,params=None): - curr,p=self.start_node,(params if params else {**self.params}) + curr,p=self.start,(params if params else {**self.params}) while curr: curr.set_params(p) c=curr._run(shared) @@ -83,7 +83,7 @@ class AsyncNode(Node): class AsyncFlow(Flow,AsyncNode): async def _exec_async(self,shared,params=None): - curr,p=self.start_node,(params if params else {**self.params}) + curr,p=self.start,(params if params else {**self.params}) while curr: curr.set_params(p) c=await curr._run_async(shared) if 
hasattr(curr,"run_async") else curr._run(shared) diff --git a/setup.py b/setup.py index 97caa74..3a38bbe 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ from setuptools import setup, find_packages setup( name="minillmflow", - version="0.0.0", + version="0.0.2", packages=find_packages(), author="Zachary Huang", author_email="zh2408@columbia.edu", diff --git a/tests/test_async_batch_flow.py b/tests/test_async_batch_flow.py index e0e911e..c8c7855 100644 --- a/tests/test_async_batch_flow.py +++ b/tests/test_async_batch_flow.py @@ -46,7 +46,7 @@ class TestAsyncBatchFlow(unittest.TestCase): } } - flow = SimpleTestAsyncBatchFlow(start_node=self.process_node) + flow = SimpleTestAsyncBatchFlow(start=self.process_node) asyncio.run(flow.run_async(shared_storage)) expected_results = { @@ -66,7 +66,7 @@ class TestAsyncBatchFlow(unittest.TestCase): 'input_data': {} } - flow = EmptyTestAsyncBatchFlow(start_node=self.process_node) + flow = EmptyTestAsyncBatchFlow(start=self.process_node) asyncio.run(flow.run_async(shared_storage)) self.assertEqual(shared_storage.get('results', {}), {}) @@ -85,7 +85,7 @@ class TestAsyncBatchFlow(unittest.TestCase): } } - flow = ErrorTestAsyncBatchFlow(start_node=AsyncErrorNode()) + flow = ErrorTestAsyncBatchFlow(start=AsyncErrorNode()) with self.assertRaises(ValueError): asyncio.run(flow.run_async(shared_storage)) @@ -126,7 +126,7 @@ class TestAsyncBatchFlow(unittest.TestCase): } } - flow = NestedAsyncBatchFlow(start_node=inner_node) + flow = NestedAsyncBatchFlow(start=inner_node) asyncio.run(flow.run_async(shared_storage)) expected_results = { @@ -162,7 +162,7 @@ class TestAsyncBatchFlow(unittest.TestCase): } } - flow = CustomParamAsyncBatchFlow(start_node=CustomParamAsyncNode()) + flow = CustomParamAsyncBatchFlow(start=CustomParamAsyncNode()) asyncio.run(flow.run_async(shared_storage)) expected_results = { diff --git a/tests/test_async_flow.py b/tests/test_async_flow.py index bda8ec5..ceb5a9a 100644 --- a/tests/test_async_flow.py +++ 
b/tests/test_async_flow.py @@ -86,14 +86,14 @@ class TestAsyncFlow(unittest.TestCase): """ # Create our nodes - start_node = AsyncNumberNode(5) + start = AsyncNumberNode(5) inc_node = AsyncIncrementNode() - # Chain them: start_node >> inc_node - start_node - "number_set" >> inc_node + # Chain them: start >> inc_node + start - "number_set" >> inc_node - # Create an AsyncFlow with start_node - flow = AsyncFlow(start_node) + # Create an AsyncFlow with start + flow = AsyncFlow(start) # We'll run the flow synchronously (which under the hood is asyncio.run()) shared_storage = {} @@ -135,15 +135,15 @@ class TestAsyncFlow(unittest.TestCase): shared_storage = {"value": 10} - start_node = BranchingAsyncNode() + start = BranchingAsyncNode() positive_node = PositiveNode() negative_node = NegativeNode() # Condition-based chaining - start_node - "positive_branch" >> positive_node - start_node - "negative_branch" >> negative_node + start - "positive_branch" >> positive_node + start - "negative_branch" >> negative_node - flow = AsyncFlow(start_node) + flow = AsyncFlow(start) asyncio.run(flow.run_async(shared_storage)) self.assertEqual(shared_storage["path"], "positive", diff --git a/tests/test_batch_flow.py b/tests/test_batch_flow.py index cd2463b..6706175 100644 --- a/tests/test_batch_flow.py +++ b/tests/test_batch_flow.py @@ -40,7 +40,7 @@ class TestBatchFlow(unittest.TestCase): } } - flow = SimpleTestBatchFlow(start_node=self.process_node) + flow = SimpleTestBatchFlow(start=self.process_node) flow.run(shared_storage) expected_results = { @@ -60,7 +60,7 @@ class TestBatchFlow(unittest.TestCase): 'input_data': {} } - flow = EmptyTestBatchFlow(start_node=self.process_node) + flow = EmptyTestBatchFlow(start=self.process_node) flow.run(shared_storage) self.assertEqual(shared_storage.get('results', {}), {}) @@ -77,7 +77,7 @@ class TestBatchFlow(unittest.TestCase): } } - flow = SingleItemBatchFlow(start_node=self.process_node) + flow = SingleItemBatchFlow(start=self.process_node) 
flow.run(shared_storage) expected_results = { @@ -99,7 +99,7 @@ class TestBatchFlow(unittest.TestCase): } } - flow = ErrorTestBatchFlow(start_node=ErrorProcessNode()) + flow = ErrorTestBatchFlow(start=ErrorProcessNode()) with self.assertRaises(ValueError): flow.run(shared_storage) @@ -136,7 +136,7 @@ class TestBatchFlow(unittest.TestCase): } } - flow = NestedBatchFlow(start_node=inner_node) + flow = NestedBatchFlow(start=inner_node) flow.run(shared_storage) expected_results = { @@ -170,7 +170,7 @@ class TestBatchFlow(unittest.TestCase): } } - flow = CustomParamBatchFlow(start_node=CustomParamNode()) + flow = CustomParamBatchFlow(start=CustomParamNode()) flow.run(shared_storage) expected_results = { diff --git a/tests/test_batch_node.py b/tests/test_batch_node.py index 8f3a145..06f4703 100644 --- a/tests/test_batch_node.py +++ b/tests/test_batch_node.py @@ -74,7 +74,7 @@ class TestBatchNode(unittest.TestCase): chunk_node >> reduce_node # Create and run pipeline - pipeline = Flow(start_node=chunk_node) + pipeline = Flow(start=chunk_node) pipeline.run(shared_storage) self.assertEqual(shared_storage['total'], expected_sum) @@ -95,7 +95,7 @@ class TestBatchNode(unittest.TestCase): reduce_node = SumReduceNode() chunk_node >> reduce_node - pipeline = Flow(start_node=chunk_node) + pipeline = Flow(start=chunk_node) pipeline.run(shared_storage) self.assertEqual(shared_storage['total'], expected_sum) @@ -116,7 +116,7 @@ class TestBatchNode(unittest.TestCase): reduce_node = SumReduceNode() chunk_node >> reduce_node - pipeline = Flow(start_node=chunk_node) + pipeline = Flow(start=chunk_node) pipeline.run(shared_storage) self.assertEqual(shared_storage['total'], expected_sum) @@ -136,7 +136,7 @@ class TestBatchNode(unittest.TestCase): reduce_node = SumReduceNode() chunk_node >> reduce_node - pipeline = Flow(start_node=chunk_node) + pipeline = Flow(start=chunk_node) pipeline.run(shared_storage) self.assertEqual(shared_storage['total'], expected_sum) @@ -153,7 +153,7 @@ class 
TestBatchNode(unittest.TestCase): reduce_node = SumReduceNode() chunk_node >> reduce_node - pipeline = Flow(start_node=chunk_node) + pipeline = Flow(start=chunk_node) pipeline.run(shared_storage) self.assertEqual(shared_storage['total'], 0) diff --git a/tests/test_flow_basic.py b/tests/test_flow_basic.py index 3f59744..a09d51c 100644 --- a/tests/test_flow_basic.py +++ b/tests/test_flow_basic.py @@ -45,7 +45,7 @@ class TestNode(unittest.TestCase): def test_single_number(self): shared_storage = {} start = NumberNode(5) - pipeline = Flow(start_node=start) + pipeline = Flow(start=start) pipeline.run(shared_storage) self.assertEqual(shared_storage['current'], 5) @@ -65,7 +65,7 @@ class TestNode(unittest.TestCase): # Chain them in sequence using the >> operator n1 >> n2 >> n3 - pipeline = Flow(start_node=n1) + pipeline = Flow(start=n1) pipeline.run(shared_storage) self.assertEqual(shared_storage['current'], 16) @@ -94,7 +94,7 @@ class TestNode(unittest.TestCase): check - "positive" >> add_if_positive check - "negative" >> add_if_negative - pipeline = Flow(start_node=start) + pipeline = Flow(start=start) pipeline.run(shared_storage) self.assertEqual(shared_storage['current'], 15) @@ -118,7 +118,7 @@ class TestNode(unittest.TestCase): check - "positive" >> add_if_positive check - "negative" >> add_if_negative - pipeline = Flow(start_node=start) + pipeline = Flow(start=start) pipeline.run(shared_storage) # Should have gone down the 'negative' branch @@ -145,7 +145,7 @@ class TestNode(unittest.TestCase): # Attach a no-op node on the negative branch to avoid warning check - 'negative' >> no_op - pipeline = Flow(start_node=n1) + pipeline = Flow(start=n1) pipeline.run(shared_storage) # final result should be -2: (10 -> 7 -> 4 -> 1 -> -2) diff --git a/tests/test_flow_composition.py b/tests/test_flow_composition.py index bac8fa0..84403b0 100644 --- a/tests/test_flow_composition.py +++ b/tests/test_flow_composition.py @@ -35,21 +35,21 @@ class 
TestFlowComposition(unittest.TestCase): def test_flow_as_node(self): """ 1) Create a Flow (f1) starting with NumberNode(5), then AddNode(10), then MultiplyNode(2). - 2) Create a second Flow (f2) whose start_node is f1. + 2) Create a second Flow (f2) whose start is f1. 3) Create a wrapper Flow (f3) that contains f2 to ensure proper execution. Expected final result in shared_storage['current']: (5 + 10) * 2 = 30. """ shared_storage = {} # Inner flow f1 - f1 = Flow(start_node=NumberNode(5)) + f1 = Flow(start=NumberNode(5)) f1 >> AddNode(10) >> MultiplyNode(2) # f2 starts with f1 - f2 = Flow(start_node=f1) + f2 = Flow(start=f1) # Wrapper flow f3 to ensure proper execution - f3 = Flow(start_node=f2) + f3 = Flow(start=f2) f3.run(shared_storage) self.assertEqual(shared_storage['current'], 30) @@ -65,15 +65,15 @@ class TestFlowComposition(unittest.TestCase): shared_storage = {} # Build the inner flow - inner_flow = Flow(start_node=NumberNode(5)) + inner_flow = Flow(start=NumberNode(5)) inner_flow >> AddNode(3) - # Build the middle flow, whose start_node is the inner flow - middle_flow = Flow(start_node=inner_flow) + # Build the middle flow, whose start is the inner flow + middle_flow = Flow(start=inner_flow) middle_flow >> MultiplyNode(4) # Wrapper flow to ensure proper execution - wrapper_flow = Flow(start_node=middle_flow) + wrapper_flow = Flow(start=middle_flow) wrapper_flow.run(shared_storage) self.assertEqual(shared_storage['current'], 32) @@ -91,16 +91,16 @@ class TestFlowComposition(unittest.TestCase): # flow1 numbernode = NumberNode(10) numbernode >> AddNode(10) - flow1 = Flow(start_node=numbernode) + flow1 = Flow(start=numbernode) # flow2 - flow2 = Flow(start_node=MultiplyNode(2)) + flow2 = Flow(start=MultiplyNode(2)) # Chain flow1 to flow2 flow1 >> flow2 # Wrapper flow to ensure proper execution - wrapper_flow = Flow(start_node=flow1) + wrapper_flow = Flow(start=flow1) wrapper_flow.run(shared_storage) self.assertEqual(shared_storage['current'], 40)