Fetch script now much more capable with headed chromium + playwright
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m29s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m47s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 5m7s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m30s
@@ -7,6 +7,8 @@ URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_SORTBY_NEWLY_LISTED="&_sop=10"
URL_SORTBY_ENDING_SOONEST="&_sop=1"
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_CATEGORY_SSD="&_sacat=175669"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"

if [ -z "${XDG_DATA_HOME}" ]; then
@@ -14,34 +16,90 @@ if [ -z "${XDG_DATA_HOME}" ]; then
exit
fi

DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
if [ ! -s "$DIR_SSDS/url.json" ]; then
URL_CATEGORY_SSD="&_sacat=175669"
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
sleep 2
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
# Heh, so eBay started to block my scraping efforts after a while. I couldn't
# get this working with wget, so I decided to go for this route, which is quite
# ugly but should work in the end. We effectively run a non-headless (headed)
# version of a browser with various realistic headers and screen dimensions.
# Lastly, we give the page an extra 5 seconds to run any JavaScript that is
# often used to counter scraping or bots.
fetch_compress_save_html() {
local url="$1"
local output_file="$2"

DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
sleep 2
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
echo "Fetching $url"
xvfb-run --server-args="-screen 0 1024x768x24" \
uv run --with playwright --with playwright-stealth - "$url" <<'EOF' | zstd -z --ultra -19 -o "$output_file"
import asyncio
import sys
from playwright.async_api import async_playwright
from playwright_stealth import Stealth

async def main():
    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(
            executable_path='/usr/bin/chromium',
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                "--window-size=1901,1018"
            ],
            headless=False
        )
        # Create context with user agent
        context = await browser.new_context(
            color_scheme=r"light",
            locale=r"en-US,en;q=0.9",
            user_agent=r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
            timezone_id=r"America/New_York",
            extra_http_headers={
                "origin": "https://www.ebay.com",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br, zstd",
                "cache-control": "no-cache",
                "accept-language": "en-US,en;q=0.9"
            }
        )
        page = await context.new_page()

        await page.goto(sys.argv[1], wait_until="domcontentloaded")
        await page.wait_for_timeout(5000)
        print(await page.content())
        await browser.close()

asyncio.run(main())
EOF
}
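As a rough standalone smoke test of the helper above (not part of the script: the URL is assembled from the constants defined at the top, and the output path is just a placeholder), it can be called directly once chromium, xvfb-run, uv and zstd are available on the host; uv pulls playwright and playwright-stealth itself via --with:

# Hypothetical one-off check: grab a single SSD category page and compress it to /tmp.
fetch_compress_save_html \
    "https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&LH_PrefLoc=3" \
    "/tmp/ebay_ssd_test.html.zst"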

fetch() {
local name="$1"
local url_param="$2"

DIR="$XDG_DATA_HOME/scraper/raw_scraped/$name"
mkdir -p "$DIR"
if [ ! -s "$DIR/url.json" ]; then
local URL="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$url_param&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL\"}" > "$DIR/url.json"
fi

URL_NEWEST="$(jq '.url' "$DIR/url.json" --raw-output)$URL_SORTBY_NEWLY_LISTED"
fetch_compress_save_html "$URL_NEWEST" "$DIR/$(date +%s).html.zst"

URL_ENDING="$(jq '.url' "$DIR/url.json" --raw-output)$URL_SORTBY_ENDING_SOONEST"
fetch_compress_save_html "$URL_ENDING" "$DIR/$(date +%s).html.zst"
}

fetch "ssd" "$URL_CATEGORY_SSD"
fetch "minipc" "$URL_CATEGORY_MINIPC_ALLINONE"
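For reference, with the constants at the top of the script, the first call above builds and stores roughly the following URL (URL_PER_PAGE_240 is defined elsewhere in the script and is left unexpanded here):

# fetch "ssd" resolves to approximately:
# https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1${URL_PER_PAGE_240}
# ...and then fetches it twice, once with "&_sop=10" (newly listed) and once with "&_sop=1" (ending soonest) appended.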

# Little helper to ensure we see entries in journald
echo Done

# If needing to do a mass compression;
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}

# If needing to purge bogus downloads
# fd --size -100K .html.zst -x ls -lah {}
# fd --size -100K .html.zst -x rm {}
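A couple of related one-liners that may be handy when eyeballing a capture (not part of the script; the file names are placeholders):

# Peek at a compressed capture without writing a decompressed copy:
# zstdcat <timestamp>.html.zst | head -c 2000
# Decompress a single capture next to the original:
# zstd -d <timestamp>.html.zst -o <timestamp>.html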

# Level compression analysis;
#
# A single scraped result;