1 Commit

Author SHA1 Message Date
b538dd8012 Allow saving and ingesting from zstd compressed scrapes
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 3m46s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m3s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m14s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 9m19s
2025-09-01 14:45:21 -04:00
7 changed files with 126 additions and 8482 deletions

Cargo.lock (generated)
View File

@@ -690,6 +690,7 @@ dependencies = [
"test-log",
"tracing",
"tracing-subscriber",
"zstd",
]
[[package]]

View File

@@ -18,6 +18,7 @@ serde_json = "1.0.140"
test-log = { version = "0.2.17", features = ["trace"] }
tracing = { version = "0.1.41", features = ["attributes"] }
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
zstd = "0.13.3"
[dev-dependencies]
similar-asserts = "1.7.0"

View File

@@ -5,9 +5,10 @@ use crate::{
use rayon::prelude::*;
use serde::Deserialize;
use serde_json;
use std::path::Path;
use std::time::Instant;
use std::{io::Read, path::Path};
use tracing::{debug, error, info};
use zstd;
fn timestamps_from_dir(path: &Path) -> Vec<i64> {
if !std::fs::exists(path).expect("Directory must exist") {
@@ -20,17 +21,46 @@ fn timestamps_from_dir(path: &Path) -> Vec<i64> {
std::fs::read_dir(path)
.unwrap()
.map(|fpath| fpath.unwrap().path())
.filter_map(|fstem| {
fstem
.file_stem()
.and_then(|s| s.to_str())
.expect("Invalid file name")
.parse()
.ok()
.filter_map(|fname| {
// file_stem() only strips the final extension (so "<ts>.html.zst"
// still comes back as "<ts>.html") and file_prefix() is not yet stable,
// so split on the first '.' ourselves. Plain "<ts>.html" files have no
// remaining '.', so fall back to the whole stem.
let stem = fname.file_stem()?.to_str()?;
Some(stem.split_once('.').map_or(stem, |(prefix, _)| prefix).to_owned())
})
.filter_map(|fname| fname.parse().ok())
.collect()
}
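
As a quick standalone check (not part of the commit) of what that comment describes: std's file_stem() only strips the final extension, so a doubly-extended scrape name still needs the manual split to recover the timestamp.

use std::path::Path;

fn main() {
    // file_stem() drops only the last extension, so ".html" survives the ".zst".
    let compressed = Path::new("1750369463.html.zst");
    assert_eq!(compressed.file_stem().unwrap().to_str().unwrap(), "1750369463.html");

    // Splitting on the first '.' recovers the bare timestamp, which then parses to i64.
    let stem = "1750369463.html";
    let ts: i64 = stem.split_once('.').map_or(stem, |(t, _)| t).parse().unwrap();
    assert_eq!(ts, 1750369463);
}
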
fn read_timestamp_from_dir(
dir: &Path,
timestamp: &chrono::DateTime<chrono::Utc>,
) -> Option<String> {
// First check for the normal html version, which we can just read straight.
let page_path = dir.join(format!("{}.{}", timestamp.timestamp(), "html"));
if page_path.exists() {
return std::fs::read_to_string(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok();
}
// Otherwise, check for the zstd-compressed version.
let page_path = dir.join(format!("{}.{}.{}", timestamp.timestamp(), "html", "zst"));
if page_path.exists() {
let f = std::fs::File::open(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok()?;
let mut s = String::new();
zstd::Decoder::new(f).ok()?.read_to_string(&mut s).ok()?;
return Some(s);
}
error!(
"Failed to lookup file for timestamp {} in {}, bailing ...",
timestamp,
dir.display()
);
None
}
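
A minimal sanity-check sketch of the decode path above (a hypothetical snippet, not part of the commit; it assumes only the zstd crate's stream helpers that Cargo.toml now pulls in, and the roundtrip name is made up): compress a page in memory at level 19, the level the scrape script uses, and read it back the same way read_timestamp_from_dir does.

use std::io::Read;

// Hypothetical round trip: compress in memory, then decode through
// zstd::Decoder exactly like the "<ts>.html.zst" branch above.
fn roundtrip(page: &str) -> std::io::Result<String> {
    let compressed = zstd::stream::encode_all(page.as_bytes(), 19)?;
    let mut out = String::new();
    zstd::Decoder::new(std::io::Cursor::new(compressed))?.read_to_string(&mut out)?;
    Ok(out)
}

fn main() {
    let html = "<html><body>listing</body></html>";
    assert_eq!(roundtrip(html).unwrap(), html);
}
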
pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
// Ensure the category is created.
let url_fpath = dir.join("url.json");
@@ -93,10 +123,7 @@ pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Optio
category: category.to_string(),
};
let page_path = dir.join(format!("{}.html", ts.timestamp()));
let page_contents = std::fs::read_to_string(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok()?;
let page_contents = read_timestamp_from_dir(dir, &ts)?;
let elements =
parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
info!(

View File

@@ -172,12 +172,22 @@ pub fn parse_from_ebay_page(
mod tests {
use super::*;
use similar_asserts::assert_eq;
use std::io::Read;
use zstd;
#[test_log::test]
fn parse() {
let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
let parsed = parse_from_ebay_page(html, &timestamp, "ssd").unwrap();
let zstd = include_bytes!("../test_data/scraper/raw_scraped/ssd/1750369463.html.zst");
let cursor = std::io::Cursor::new(zstd);
let mut html = String::new();
zstd::Decoder::new(cursor)
.unwrap()
.read_to_string(&mut html)
.unwrap();
let parsed = parse_from_ebay_page(&html, &timestamp, "ssd").unwrap();
// assert_eq!(parsed.len(), 62);
let parsed = parsed.first_chunk::<10>().unwrap();
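
Design note: the test fixture is now embedded as a zstd-compressed blob via include_bytes! and decoded inside the test; the suppressed diff and the "Binary file not shown" entry at the bottom, along with most of the 8482 deleted lines, presumably correspond to the old plain-HTML fixture being swapped for 1750369463.html.zst.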

View File

@@ -5,6 +5,8 @@ URL_PER_PAGE_240="&_ipg=240"
URL_MIN_PRICE_USD_60="&_udlo=60.00"
URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_SORTBY_NEWLY_LISTED="&_sop=10"
URL_SORTBY_ENDING_SOONEST="&_sop=1"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"
if [ -z "${XDG_DATA_HOME}" ]; then
@@ -12,20 +14,85 @@ if [ -z "${XDG_DATA_HOME}" ]; then
exit
fi
DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
if [ ! -s "$DIR_SSDS/url.json" ]; then
URL_CATEGORY_SSD="&_sacat=175669"
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
wget -o "$DIR_SSDS/$(date +%s).html" "$(jq '.url' $DIR_SSDS/url.json)"
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
sleep 2
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
wget -o "$DIR_MINIPC/$(date +%s).html" "$(jq '.url' $DIR_MINIPC/url.json)"
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
sleep 2
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
# Little helper to ensure we see entries in journald
echo Done
# If you need to do a mass compression:
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}
# Compression level analysis:
#
# A single scraped result:
# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
#
# Let's see if we get any benefit from tar'ing and then compressing:
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012328.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012331.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015932.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015929.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019567.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019564.html
# -rw-r--r-- 1 hak8or users 16M Sep 1 12:23 175501.tar
# ➜ for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
# 175501.tar : 5.36% ( 15.6 MiB => 854 KiB, 175501.tar.9)
# 175501.tar : 5.32% ( 15.6 MiB => 847 KiB, 175501.tar.10)
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.11)
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.12)
# 175501.tar : 5.48% ( 15.6 MiB => 872 KiB, 175501.tar.13)
# 175501.tar : 5.42% ( 15.6 MiB => 864 KiB, 175501.tar.14)
# 175501.tar : 5.19% ( 15.6 MiB => 828 KiB, 175501.tar.15)
# 175501.tar : 5.31% ( 15.6 MiB => 845 KiB, 175501.tar.16)
# 175501.tar : 5.01% ( 15.6 MiB => 798 KiB, 175501.tar.17)
# 175501.tar : 5.04% ( 15.6 MiB => 803 KiB, 175501.tar.18)
# 175501.tar : 4.84% ( 15.6 MiB => 771 KiB, 175501.tar.19)
# 175501.tar : 4.79% ( 15.6 MiB => 764 KiB, 175501.tar.20)
# 175501.tar : 4.74% ( 15.6 MiB => 755 KiB, 175501.tar.21)
# 175501.tar : 4.73% ( 15.6 MiB => 753 KiB, 175501.tar.22)
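
Rough takeaway from the numbers above: compressing each scrape on its own bottoms out around 7.9% of the original size even at level 22, while tar'ing six similar scrapes first reaches roughly 4.7-5.4% from level 9 upward, most likely because the larger match window at higher levels lets zstd reuse the boilerplate shared across scrapes. Per-file compression is kept regardless, since read_timestamp_from_dir needs to address each scrape independently by its timestamp.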

File diff suppressed because one or more lines are too long

Binary file not shown.