Fetch script now much more capable with headed chromium + playwright
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m29s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m47s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 5m7s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m30s
@@ -7,6 +7,8 @@ URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_SORTBY_NEWLY_LISTED="&_sop=10"
URL_SORTBY_ENDING_SOONEST="&_sop=1"
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_CATEGORY_SSD="&_sacat=175669"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"

if [ -z "${XDG_DATA_HOME}" ]; then
@@ -14,34 +16,90 @@ if [ -z "${XDG_DATA_HOME}" ]; then
exit
fi

DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
if [ ! -s "$DIR_SSDS/url.json" ]; then
URL_CATEGORY_SSD="&_sacat=175669"
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
sleep 2
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
# Heh, so eBay started to block my scraping efforts after a while. I couldn't
# get this working with wget, so I decided to go for this route, which is quite
# ugly but should work in the end. We effectively run a non-headless (headed)
# version of a browser with various realistic headers and screen dimensions.
# Lastly, we give the page an extra 5 seconds to run any JavaScript that is
# often used to counter scraping or bots.
fetch_compress_save_html() {
local url="$1"
local output_file="$2"

DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
sleep 2
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
echo "Fetching $url"
xvfb-run --server-args="-screen 0 1024x768x24" \
uv run --with playwright --with playwright-stealth - "$url" <<'EOF' | zstd -z --ultra -19 -o "$output_file"
import asyncio
import sys
from playwright.async_api import async_playwright
from playwright_stealth import Stealth

async def main():
    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(
            executable_path='/usr/bin/chromium',
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                "--window-size=1901,1018"
            ],
            headless=False
        )
        # Create context with user agent
        context = await browser.new_context(
            color_scheme=r"light",
            locale=r"en-US,en;q=0.9",
            user_agent=r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
            timezone_id=r"America/New_York",
            extra_http_headers={
                "origin": "https://www.ebay.com",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br, zstd",
                "cache-control": "no-cache",
                "accept-language": "en-US,en;q=0.9"
            }
        )
        page = await context.new_page()

        await page.goto(sys.argv[1], wait_until="domcontentloaded")
        await page.wait_for_timeout(5000)
        print(await page.content())
        await browser.close()

asyncio.run(main())
EOF
}
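As a rough standalone smoke test of the helper above (not part of the script: the URL is assembled from the constants defined at the top, and the output path is just a placeholder), it can be called directly once chromium, xvfb-run, uv and zstd are available on the host; uv pulls playwright and playwright-stealth itself via --with:

# Hypothetical one-off check: grab a single SSD category page and compress it to /tmp.
fetch_compress_save_html \
    "https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&LH_PrefLoc=3" \
    "/tmp/ebay_ssd_test.html.zst"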

fetch() {
local name="$1"
local url_param="$2"

DIR="$XDG_DATA_HOME/scraper/raw_scraped/$name"
mkdir -p "$DIR"
if [ ! -s "$DIR/url.json" ]; then
local URL="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$url_param&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL\"}" > "$DIR/url.json"
fi

URL_NEWEST="$(jq '.url' "$DIR/url.json" --raw-output)$URL_SORTBY_NEWLY_LISTED"
fetch_compress_save_html "$URL_NEWEST" "$DIR/$(date +%s).html.zst"

URL_ENDING="$(jq '.url' "$DIR/url.json" --raw-output)$URL_SORTBY_ENDING_SOONEST"
fetch_compress_save_html "$URL_ENDING" "$DIR/$(date +%s).html.zst"
}

fetch "ssd" "$URL_CATEGORY_SSD"
fetch "minipc" "$URL_CATEGORY_MINIPC_ALLINONE"
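For reference, with the constants at the top of the script, the first call above builds and stores roughly the following URL (URL_PER_PAGE_240 is defined elsewhere in the script and is left unexpanded here):

# fetch "ssd" resolves to approximately:
# https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1${URL_PER_PAGE_240}
# ...and then fetches it twice, once with "&_sop=10" (newly listed) and once with "&_sop=1" (ending soonest) appended.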

# Little helper to ensure we see entries in journald
echo Done

# If needing to do a mass compression;
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}

# If needing to purge bogus downloads
# fd --size -100K .html.zst -x ls -lah {}
# fd --size -100K .html.zst -x rm {}
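A couple of related one-liners that may be handy when eyeballing a capture (not part of the script; the file names are placeholders):

# Peek at a compressed capture without writing a decompressed copy:
# zstdcat <timestamp>.html.zst | head -c 2000
# Decompress a single capture next to the original:
# zstd -d <timestamp>.html.zst -o <timestamp>.html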

# Level compression analysis;
#
# A single scraped result;