-
InfluxDB
Power Real-Time Data Analytics at Scale. Get real-time insights from all types of time series data with InfluxDB. Ingest, query, and analyze billions of data points in real-time with unbounded cardinality.
from datetime import datetime
from pathlib import Path

import requests
from waybackpy import WaybackMachineSaveAPI  # https://github.com/akamhy/waybackpy

# Script: fetch each report listed in `urls` and, depending on the two flags
# below, submit it to the Wayback Machine and/or save it to a local directory
# named after today's date.
ARCHIVE = False     # when True, submit every URL to the Wayback Machine
LOCAL_SAVE = True   # when True, download every URL into ./<MM-DD-YYYY>/

# https://www.cmegroup.com/clearing/operations-and-deliveries/nymex-delivery-notices.html
urls = [
    # COMEX & NYMEX Metal Delivery Notices
    "https://www.cmegroup.com/delivery_reports/MetalsIssuesAndStopsReport.pdf",
    "https://www.cmegroup.com/delivery_reports/MetalsIssuesAndStopsMTDReport.pdf",
    "https://www.cmegroup.com/delivery_reports/MetalsIssuesAndStopsYTDReport.pdf",

    # NYMEX Energy Delivery Notice
    "https://www.cmegroup.com/delivery_reports/EnergiesIssuesAndStopsReport.pdf",
    "https://www.cmegroup.com/delivery_reports/EnergiesIssuesAndStopsYTDReport.pdf",

    # Warehouse & Depository Stocks
    "https://www.cmegroup.com/delivery_reports/Gold_Stocks.xls",
    "https://www.cmegroup.com/delivery_reports/Gold_Kilo_Stocks.xls",
    "https://www.cmegroup.com/delivery_reports/Silver_stocks.xls",
    "https://www.cmegroup.com/delivery_reports/Copper_Stocks.xls",
    "https://www.cmegroup.com/delivery_reports/PA-PL_Stck_Rprt.xls",
    "https://www.cmegroup.com/delivery_reports/Aluminum_Stocks.xls",
    "https://www.cmegroup.com/delivery_reports/Zinc_Stocks.xls",
    "https://www.cmegroup.com/delivery_reports/Lead_Stocks.xls",
]

# required for both wayback and cmegroup.com
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
headers = {'User-Agent': user_agent}  # present yourself as an updated Chrome browser

if ARCHIVE:
    for url in urls:
        filename = url.split("/")[-1]
        print(f"Archiving {filename} on Wayback Machine...")
        # Author's note: limited to 15 requests / minute / IP. The VPN IP was
        # already throttled and the normal IP also returned a 429 error.
        save_api = WaybackMachineSaveAPI(url, user_agent)
        res = save_api.save()
        print(f"Res: {res}")

if LOCAL_SAVE:
    # One directory per run date; re-running on the same day reuses it.
    datestr = datetime.now().strftime('%m-%d-%Y')
    datedir = Path(datestr)
    datedir.mkdir(exist_ok=True)

    for url in urls:
        filename = url.split("/")[-1]
        print(f"Fetching {filename}...")

        # Keep the try body to the single call that can raise, and handle
        # every requests failure per URL so one bad download does not abort
        # the remaining ones.
        try:
            resp = requests.get(url, timeout=3, allow_redirects=True, headers=headers)
        except requests.Timeout:
            # Covers both connect and read timeouts.
            print("timeout")
            continue
        except requests.RequestException as err:
            print(f"ERROR: request for {filename} failed: {err}")
            continue

        if not resp.ok:
            print(f"ERROR: response for {filename}: {resp}")
            continue

        filepath = datedir / filename
        try:
            # "xb" creates the file exclusively, so an existing download is
            # never overwritten and there is no check-then-write race.
            with open(filepath, mode="xb") as f:
                f.write(resp.content)
        except FileExistsError:
            print(f"ERROR: Filepath already exists: {filepath}")
Related posts
-
download all captures of a page in archive.org
-
Well worth the price
-
any way to archive all my bookmarks on archive.org?
-
Is there a way to download all the files Internet Archive has captured for a domain? I am trying to recover tweets from a suspended Twitter account, but the account as a whole was never captured in the Wayback Machine, just some individual tweets and JSON files.
-
简单run个脚本使用 wayback machine 接口批量备份知乎问题冲塔回答