#!/usr/bin/env bash
URL_PER_PAGE_60="&_ipg=60"
URL_PER_PAGE_240="&_ipg=240"
URL_MIN_PRICE_USD_60="&_udlo=60.00"
URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_SORTBY_NEWLY_LISTED="&_sop=10"
URL_SORTBY_ENDING_SOONEST="&_sop=1"
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_CATEGORY_SSD="&_sacat=175669"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"

if [ -z "${XDG_DATA_HOME}" ]; then
    echo "XDG_DATA_HOME was not set, bailing!"
    exit 1
fi

# Heh, so eBay started to block my scraping efforts after a while. I couldn't
# get this working with wget, so I went this route instead, which is quite
# ugly but works: run a non-headless browser under a virtual framebuffer with
# realistic headers and screen dimensions, then give the page an extra 5
# seconds to run any JavaScript often used to counter scraping or bots.
fetch_compress_save_html() {
    local url="$1"
    local output_file="$2"
    echo "Fetching $url"
    xvfb-run --server-args="-screen 0 1024x768x24" \
        uv run --with playwright --with playwright-stealth - "$url" <<'EOF' | zstd -z --ultra -19 -o "$output_file"
import asyncio
import sys

from playwright.async_api import async_playwright
from playwright_stealth import Stealth


async def main():
    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(
            executable_path='/usr/bin/chromium',
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                '--window-size=1901,1018'
            ],
            headless=False
        )
        # Create a context with a realistic user agent, locale, and headers
        context = await browser.new_context(
            color_scheme=r"light",
            locale=r"en-US,en;q=0.9",
            user_agent=r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
            timezone_id=r"America/New_York",
            extra_http_headers={
                "origin": "https://www.ebay.com",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br, zstd",
                "cache-control": "no-cache",
                "accept-language": "en-US,en;q=0.9"
            }
        )
        page = await context.new_page()
        # Wait for the DOM, then give any anti-bot JavaScript 5s to settle
        await page.goto(sys.argv[1], wait_until="domcontentloaded")
        await page.wait_for_timeout(5000)
        print(await page.content())
        await browser.close()

asyncio.run(main())
EOF
}
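
# A quick one-off way to sanity-check the browser path in isolation before
# wiring it into fetch() below (hypothetical URL and output path, not part of
# the normal flow):
#
#   fetch_compress_save_html "https://www.ebay.com/sch/i.html?_nkw=ssd" /tmp/test.html.zst
#   zstd -dc /tmp/test.html.zst | head -c 500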
fetch() {
    local name="$1"
    local url_param="$2"
    DIR="$XDG_DATA_HOME/scraper/raw_scraped/$name"
    mkdir -p "$DIR"
    # Persist the constructed URL on first run so later runs reuse it verbatim
    if [ ! -s "$DIR/url.json" ]; then
        local URL="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$url_param&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
        echo "{\"url\": \"$URL\"}" > "$DIR/url.json"
    fi
    URL_NEWEST="$(jq --raw-output '.url' "$DIR/url.json")$URL_SORTBY_NEWLY_LISTED"
    fetch_compress_save_html "$URL_NEWEST" "$DIR/$(date +%s).html.zst"
    URL_ENDING="$(jq --raw-output '.url' "$DIR/url.json")$URL_SORTBY_ENDING_SOONEST"
    fetch_compress_save_html "$URL_ENDING" "$DIR/$(date +%s).html.zst"
}

fetch "ssd" "$URL_CATEGORY_SSD"
fetch "minipc" "$URL_CATEGORY_MINIPC_ALLINONE"

# If needing to do a mass compression:
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}

# If needing to purge bogus downloads:
# fd --size -100K .html.zst -x ls -lah {}
# fd --size -100K .html.zst -x rm {}
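
# The purge heuristic above suggests a blocked or captcha page compresses to
# well under 100K. A sketch of an in-script guard built on that assumption
# (the 102400-byte threshold just mirrors the fd filter; GNU stat assumed):
#
#   out="$DIR/$(date +%s).html.zst"
#   fetch_compress_save_html "$URL_NEWEST" "$out"
#   if [ "$(stat -c %s "$out")" -lt 102400 ]; then
#       echo "Suspiciously small scrape, likely blocked: $out" >&2
#   fi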

# Level compression analysis:
#
# A single scraped result (loop below is fish syntax):
# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
#
# Let's see if we get benefits from tar'ing and then compressing:
# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755012328.html
# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755012331.html
# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755015932.html
# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755015929.html
# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755019567.html
# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755019564.html
# -rw-r--r-- 1 hak8or users  16M Sep  1 12:23 175501.tar
# for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
# 175501.tar : 5.36% ( 15.6 MiB =>  854 KiB, 175501.tar.9)
# 175501.tar : 5.32% ( 15.6 MiB =>  847 KiB, 175501.tar.10)
# 175501.tar : 5.30% ( 15.6 MiB =>  844 KiB, 175501.tar.11)
# 175501.tar : 5.30% ( 15.6 MiB =>  844 KiB, 175501.tar.12)
# 175501.tar : 5.48% ( 15.6 MiB =>  872 KiB, 175501.tar.13)
# 175501.tar : 5.42% ( 15.6 MiB =>  864 KiB, 175501.tar.14)
# 175501.tar : 5.19% ( 15.6 MiB =>  828 KiB, 175501.tar.15)
# 175501.tar : 5.31% ( 15.6 MiB =>  845 KiB, 175501.tar.16)
# 175501.tar : 5.01% ( 15.6 MiB =>  798 KiB, 175501.tar.17)
# 175501.tar : 5.04% ( 15.6 MiB =>  803 KiB, 175501.tar.18)
# 175501.tar : 4.84% ( 15.6 MiB =>  771 KiB, 175501.tar.19)
# 175501.tar : 4.79% ( 15.6 MiB =>  764 KiB, 175501.tar.20)
# 175501.tar : 4.74% ( 15.6 MiB =>  755 KiB, 175501.tar.21)
# 175501.tar : 4.73% ( 15.6 MiB =>  753 KiB, 175501.tar.22)
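
# The big jump at level 9 in the tar run is likely zstd's match window growing
# large enough to reach back into earlier, near-identical pages in the archive.
# A sketch that asks for a large window explicitly instead of relying on the
# level (assumption: --long=27, a 128 MiB window, comfortably covers a batch):
#
#   tar cf - *.html | zstd -19 --long=27 -o batch.tar.zst
#   zstd -d --long=27 batch.tar.zst -o batch.tar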