From cb8025becdb64c3d99a4ca77702e043eb9080c38 Mon Sep 17 00:00:00 2001
From: hak8or
Date: Tue, 9 Sep 2025 22:47:45 -0400
Subject: [PATCH] Fetch script now much more capable with headed Chromium +
 Playwright

---
 systemd/scraper_fetch.sh | 102 ++++++++++++++++++++++++++++++---------
 1 file changed, 80 insertions(+), 22 deletions(-)

diff --git a/systemd/scraper_fetch.sh b/systemd/scraper_fetch.sh
index 29c21b0..efa2259 100755
--- a/systemd/scraper_fetch.sh
+++ b/systemd/scraper_fetch.sh
@@ -7,6 +7,8 @@ URL_SEARCHTERM_NONE="&_nkw="
 URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
 URL_SORTBY_NEWLY_LISTED="&_sop=10"
 URL_SORTBY_ENDING_SOONEST="&_sop=1"
+URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
+URL_CATEGORY_SSD="&_sacat=175669"
 URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"
 
 if [ -z "${XDG_DATA_HOME}" ]; then
@@ -14,34 +16,90 @@ if [ -z "${XDG_DATA_HOME}" ]; then
   exit
 fi
 
-DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
-mkdir -p "$DIR_SSDS"
-if [ ! -s "$DIR_SSDS/url.json" ]; then
-  URL_CATEGORY_SSD="&_sacat=175669"
-  URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
-  echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
-fi
-curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
-sleep 2
-curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
+# Heh, so eBay started to block my scraping efforts after a while. I couldn't
+# get past it with wget, so I settled on this route, which is quite ugly but
+# should work. We effectively run a non-headless browser under a virtual X
+# display, with various realistic headers and screen dimensions. Lastly, we
+# give the page an extra 5 seconds to run any JavaScript often used to
+# counter scraping or bots.
+fetch_compress_save_html() {
+  local url="$1"
+  local output_file="$2"
 
-DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
-mkdir -p "$DIR_MINIPC"
-if [ ! -s "$DIR_MINIPC/url.json" ]; then
-s "$DIR_MINIPC/url.json" ]; then - URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179" - URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240" - echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json" -fi -curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst" -sleep 2 -curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst" + echo Fetching $url + xvfb-run --server-args="-screen 0 1024x768x24" \ + uv run --with playwright --with playwright-stealth - $url <<'EOF' | zstd -z --ultra -19 -o $output_file +import asyncio +import sys +from playwright.async_api import async_playwright +from playwright_stealth import Stealth + +async def main(): + async with Stealth().use_async(async_playwright()) as p: + browser = await p.chromium.launch( + executable_path='/usr/bin/chromium', + args=[ + '--no-sandbox', + '--disable-dev-shm-usage', + '--disable-blink-features=AutomationControlled', + "--window-size=1901,1018" + ], + headless=False + ) + # Create context with user agent + context = await browser.new_context( + color_scheme=r"light", + locale=r"en-US,en;q=0.9", + user_agent=r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36", + timezone_id=r"America/New_York", + extra_http_headers={ + "origin": "https://www.ebay.com", + "accept": "*/*", + "accept-encoding": "gzip, deflate, br, zstd", + "cache-control": "no-cache", + "accept-language": "en-US,en;q=0.9" + } + ) + page = await context.new_page() + + await page.goto(sys.argv[1], wait_until="domcontentloaded") + await page.wait_for_timeout(5000) + print(await page.content()) + await browser.close() + +asyncio.run(main()) +EOF +} + +fetch() { + local name="$1" + local url_param="$2" + + DIR="$XDG_DATA_HOME/scraper/raw_scraped/$name" + mkdir -p "$DIR" + if [ ! -s "$DIR/url.json" ]; then + local URL="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$url_param&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240" + echo "{\"url\": \"$URL\"}" > "$DIR/url.json" + fi + + URL_NEWEST="$(jq '.url' $DIR/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" + fetch_compress_save_html $URL_NEWEST "$DIR/$(date +%s).html.zst" + + URL_ENDING="$(jq '.url' $DIR/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" + fetch_compress_save_html $URL_ENDING "$DIR/$(date +%s).html.zst" +} + +fetch "ssd" "$URL_CATEGORY_SSD" +fetch "minipc" "$URL_CATEGORY_MINIPC_ALLINONE" -# Little helper to ensure we see entries in journald -echo Done # If needing to do a mass compression; # fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {} +# If needing to purge bogus downloads +# fd --size -100K .html.zst -x ls -lah {} +# fd --size -100K .html.zst -x rm {} + # Level compression analysis; # # A single scraped result;