diff --git a/content.py b/content.py
index 8debc49..436b454 100644
--- a/content.py
+++ b/content.py
@@ -996,6 +996,250 @@ def repair_ezproxy_links():
     pass
+
+def download_web():
+    # Helpers for a small same-folder crawler: fetch pages, select an
+    # XPath fragment, and localize the images it references.
+    import argparse, os, re, time, hashlib, mimetypes, subprocess
+    from collections import deque
+    from urllib.parse import urlsplit, urlunsplit, urljoin
+    import posixpath as ppath
+    import requests
+    from lxml import html
+
+    SESSION = requests.Session()
+    SESSION.headers.update({
+        "User-Agent": "MiniXPathCrawler/1.0 (+for personal archiving; contact admin if issues)"
+    })
+
+    def normalize_path(path: str) -> str:
+        # Collapse "." / ".." segments and guarantee a leading slash.
+        np = ppath.normpath(path or "/")
+        if not np.startswith("/"):
+            np = "/" + np
+        return np
+
+    def base_dir_of(path: str) -> str:
+        # Ensure a trailing slash so folder prefixes compare cleanly.
+        if not path or path.endswith("/"):
+            bd = path or "/"
+        else:
+            bd = ppath.dirname(path) + "/"
+        bd = normalize_path(bd)  # normpath strips the trailing slash again
+        if not bd.endswith("/"):
+            bd += "/"
+        return bd
+
+    def canonical_url(u: str, drop_query=True) -> str:
+        # Normalize the path and drop the fragment (and, by default, the
+        # query) so variants of the same page compare equal.
+        sp = urlsplit(u)
+        path = normalize_path(sp.path)
+        if drop_query:
+            sp = sp._replace(path=path, query="", fragment="")
+        else:
+            sp = sp._replace(path=path, fragment="")
+        return urlunsplit(sp)
+
+    def same_folder_or_below(start_url: str, link_url: str) -> bool:
+        su = urlsplit(start_url)
+        lu = urlsplit(link_url)
+        if su.scheme != lu.scheme or su.netloc != lu.netloc:
+            return False
+        bd = base_dir_of(su.path)     # e.g., "/a/b/"
+        tp = normalize_path(lu.path)  # e.g., "/a/b/page.html"
+        # Accept the folder itself (without its trailing slash) or
+        # anything beneath it.
+        return (tp == bd[:-1]) or tp.startswith(bd)
+
+    def is_html_response(resp: requests.Response) -> bool:
+        ctype = resp.headers.get("Content-Type", "")
+        return "html" in ctype.lower()
+
+    def fetch_html(url: str, timeout=20):
+        try:
+            r = SESSION.get(url, timeout=timeout, allow_redirects=True)
+        except requests.RequestException:
+            return None, None
+        if r.status_code != 200 or not is_html_response(r):
+            return None, None
+        try:
+            doc = html.fromstring(r.content)
+        except Exception:
+            return None, None
+        # Make links absolute for easier handling of images and hrefs.
+        doc.make_links_absolute(r.url)
+        return r, doc
+
+    def safe_filename_from_url(u: str, default_ext=".bin") -> str:
+        # Hash plus a best-effort extension; keep the extension only if
+        # it looks like a short alphanumeric suffix.
+        h = hashlib.sha1(u.encode("utf-8")).hexdigest()[:16]
+        ext = ""
+        path = urlsplit(u).path
+        if "." in path:
+            ext = "." + path.split(".")[-1].split("?")[0].split("#")[0]
+            if not re.match(r"^\.[A-Za-z0-9]{1,5}$", ext):
+                ext = ""
+        return h + (ext or default_ext)
+
+    def download_image(img_url: str, assets_dir: str) -> str | None:
+        try:
+            r = SESSION.get(img_url, timeout=20, stream=True)
+        except requests.RequestException:
+            return None
+        if r.status_code != 200:
+            return None
+        # Fall back to an extension guessed from Content-Type when the
+        # URL itself has no usable one.
+        ext = None
+        ctype = r.headers.get("Content-Type", "")
+        if "/" in ctype:
+            ext_guess = mimetypes.guess_extension(ctype.split(";")[0].strip())
+            if ext_guess:
+                ext = ext_guess
+        fname = safe_filename_from_url(img_url, default_ext=ext or ".img")
+        os.makedirs(assets_dir, exist_ok=True)
+        fpath = os.path.join(assets_dir, fname)
+        try:
+            with open(fpath, "wb") as f:
+                for chunk in r.iter_content(65536):
+                    if chunk:
+                        f.write(chunk)
+        except Exception:
+            return None
+        return fpath
+
+    def html_fragment_from_xpath(doc, xpath_expr: str, assets_dir: str):
+        nodes = doc.xpath(xpath_expr)
+        if not nodes:
+            return None, None  # (html_fragment, title)
+        # Remove