Compare commits
parallel_s ... main
1 commit: b538dd8012

46  Cargo.lock (generated)
@@ -680,7 +680,6 @@ dependencies = [
  "clap",
  "dirs",
  "lazy_static",
- "liblzma",
  "rayon",
  "regex",
  "rusqlite",
@@ -688,10 +687,10 @@ dependencies = [
  "serde",
  "serde_json",
  "similar-asserts",
- "strum",
  "test-log",
  "tracing",
  "tracing-subscriber",
+ "zstd",
 ]
 
 [[package]]
@@ -1163,26 +1162,6 @@ version = "0.2.174"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
 
-[[package]]
-name = "liblzma"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0791ab7e08ccc8e0ce893f6906eb2703ed8739d8e89b57c0714e71bad09024c8"
-dependencies = [
- "liblzma-sys",
-]
-
-[[package]]
-name = "liblzma-sys"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01b9596486f6d60c3bbe644c0e1be1aa6ccc472ad630fe8927b456973d7cb736"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
-]
-
 [[package]]
 name = "libredox"
 version = "0.1.3"
@@ -1913,29 +1892,6 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
-[[package]]
-name = "strum"
-version = "0.27.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f64def088c51c9510a8579e3c5d67c65349dcf755e5479ad3d010aa6454e2c32"
-dependencies = [
- "phf",
- "strum_macros",
-]
-
-[[package]]
-name = "strum_macros"
-version = "0.27.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c77a8c5abcaf0f9ce05d62342b7d298c346515365c36b673df4ebe3ced01fde8"
-dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "rustversion",
- "syn",
-]
-
 [[package]]
 name = "syn"
 version = "2.0.103"
Cargo.toml

@@ -9,17 +9,16 @@ chrono = { version = "0.4.41", features = ["serde"] }
 clap = { version = "4.5.40", features = ["derive"] }
 dirs = "6.0.0"
 lazy_static = "1.5.0"
-liblzma = "0.4.2"
 rayon = "1.10.0"
 regex = "1.11.1"
 rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
 scraper = "0.23.1"
 serde = { version = "1.0.219", features = ["derive"] }
 serde_json = "1.0.140"
-strum = { version = "0.27.1", features = ["std", "derive", "phf", "strum_macros"] }
 test-log = { version = "0.2.17", features = ["trace"] }
 tracing = { version = "0.1.41", features = ["attributes"] }
 tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
+zstd = "0.13.3"
 
 [dev-dependencies]
 similar-asserts = "1.7.0"
354  readme.md
@@ -3,352 +3,10 @@
 This is a dumb little tool which ingests raw HTML files, does some parsing on them, and serves the results over a web API.
 
 ```bash
-export URL_BASE="localhost:9876"; \
-echo run0 && http POST "$URL_BASE/category/ssd/discover" && \
-echo run1 && http POST "$URL_BASE/category/ssd/parse" && \
-echo run2 && http GET "$URL_BASE/category/ssd/parse" && \
-echo run3 && http POST "$URL_BASE/listing/parse" && \
-echo run4 && http GET "$URL_BASE/listings" since:=10099 limit:=10 cents_per_tbytes_max:=900 && \
-echo run5 && http GET "$URL_BASE/listing/267267322597" && \
-echo run6 && http GET "$URL_BASE/listing/267267322597/history" &&
-echo run7 && http GET "$URL_BASE/listing/267267322597/parsed"
-```
-
-```
-run0
-HTTP/1.1 200 OK
-content-length: 0
-content-type: text/plain; charset=utf-8
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-
-
-
-run1
-HTTP/1.1 200 OK
-content-length: 0
-content-type: text/plain; charset=utf-8
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-
-
-
-run2
-HTTP/1.1 200 OK
-content-length: 36
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-[
-[
-"PendingParse",
-1,
-1
-],
-[
-"Ready",
-0,
-1
-]
-]
-
-
-run3
-HTTP/1.1 200 OK
-content-length: 2
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-62
-
-
-run4
-HTTP/1.1 200 OK
-content-length: 4232
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-[
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1260,
-"item": 286605201240,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": true,
-"id": 5,
-"image_url": "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp",
-"item_id": 286605201240,
-"title": "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
-},
-"parsed": [
-{
-"id": 5,
-"individual_size_gigabytes": 1024,
-"item": 286605201240,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 1024
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 2400,
-"item": 177133381123,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 22,
-"image_url": "https://i.ebayimg.com/images/g/-VMAAOSwaX1oNyx4/s-l500.webp",
-"item_id": 177133381123,
-"title": "SanDisk professional G-DRIVE SSD 2TB, A+ condition"
-},
-"parsed": [
-{
-"id": 22,
-"individual_size_gigabytes": 2048,
-"item": 177133381123,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 2048
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 3108,
-"item": 187263467837,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 35,
-"image_url": "https://i.ebayimg.com/images/g/hn8AAOSw1hJoNrJm/s-l500.webp",
-"item_id": 187263467837,
-"title": "Used Fanxiang S880 4TB SSD NVME M.2 SSD PCIe 4x4 7300MBS Solid State Drive"
-},
-"parsed": [
-{
-"id": 35,
-"individual_size_gigabytes": 4096,
-"item": 187263467837,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 4096
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1000,
-"item": 267267367821,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 37,
-"image_url": "https://i.ebayimg.com/images/g/Cr8AAOSwXY1oN6m8/s-l500.webp",
-"item_id": 267267367821,
-"title": "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)"
-},
-"parsed": [
-{
-"id": 37,
-"individual_size_gigabytes": 256,
-"item": 267267367821,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 6,
-"total_gigabytes": 1536
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 4600,
-"item": 187263491149,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 44,
-"image_url": "https://i.ebayimg.com/images/g/v2EAAOSwg9poNrTr/s-l500.webp",
-"item_id": 187263491149,
-"title": "Used Silicon Power 4TB US75 Nvme PCIe Gen4x4 M.2 2280 SSD R/W Up to 7000/6500 MB"
-},
-"parsed": [
-{
-"id": 44,
-"individual_size_gigabytes": 4096,
-"item": 187263491149,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 4096
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1000,
-"item": 267267351339,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 46,
-"image_url": "https://i.ebayimg.com/images/g/z8EAAOSwyKZoN6TW/s-l500.webp",
-"item_id": 267267351339,
-"title": "(Lot of 6) Used -Micron MTFDDAV256TBN 256GB, M.2 2280 Solid State Drive"
-},
-"parsed": [
-{
-"id": 46,
-"individual_size_gigabytes": 256,
-"item": 267267351339,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 6,
-"total_gigabytes": 1536
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 99,
-"item": 306325087069,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 59,
-"image_url": "https://i.ebayimg.com/images/g/zuUAAOSwIoJoN5yC/s-l500.webp",
-"item_id": 306325087069,
-"title": "T298 ~ HP OEM Desktop Z240 Workstation Heatsink w NVMe M.2 256GB SSD 826414-001"
-},
-"parsed": [
-{
-"id": 59,
-"individual_size_gigabytes": 256,
-"item": 306325087069,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 256
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1000,
-"item": 267267322597,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 60,
-"image_url": "https://i.ebayimg.com/images/g/r8YAAOSwlkdoN5uW/s-l500.webp",
-"item_id": 267267322597,
-"title": "(Lot of 5) Used - Micro 1100 256GB SATA III 2.5\" SSD MTFDDAK256TBN"
-},
-"parsed": [
-{
-"id": 60,
-"individual_size_gigabytes": 256,
-"item": 267267322597,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 5,
-"total_gigabytes": 1280
-}
-]
-}
-]
-
-
-run5
-HTTP/1.1 200 OK
-content-length: 237
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-{
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 60,
-"image_url": "https://i.ebayimg.com/images/g/r8YAAOSwlkdoN5uW/s-l500.webp",
-"item_id": 267267322597,
-"title": "(Lot of 5) Used - Micro 1100 256GB SATA III 2.5\" SSD MTFDDAK256TBN"
-}
-
-
-run6
-HTTP/1.1 200 OK
-content-length: 62
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:50 GMT
-
-[
-{
-"current_bid_usd_cents": 1000,
-"when": "2025-06-19T21:44:23Z"
-}
-]
-
-
-run7
-HTTP/1.1 200 OK
-content-length: 149
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:50 GMT
-
-[
-{
-"id": 60,
-"individual_size_gigabytes": 256,
-"item": 267267322597,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 5,
-"total_gigabytes": 1280
-}
-]
+export URL_BASE="http://scraper.homelab.hak8or.com:8080"; \
+echo run0 && http POST "$URL_BASE/page/parse/ssd" && \
+echo run1 && http POST "$URL_BASE/listing/parse" && \
+echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
+echo run3 && http GET "$URL_BASE/listing/388484391867" && \
+echo run4 && http GET "$URL_BASE/listing/286605201240/history"
 ```
128  src/db.rs
@@ -3,8 +3,6 @@ use rusqlite::Connection;
 use serde::Deserialize;
 use serde::Serialize;
 use std::path::Path;
-use strum::{EnumIter, IntoEnumIterator};
-// use strum_macros::EnumIter;
 use tracing::{error, info};
 
 pub trait DBTable {
@@ -142,54 +140,30 @@ impl SearchURL {
     }
 }
 
-#[derive(Serialize, Debug, PartialEq, Clone, EnumIter)]
-pub enum ParsedPageStatus {
-    PendingParse,
-    Ready,
-}
-impl TryFrom<i64> for ParsedPageStatus {
-    type Error = rusqlite::Error;
-
-    fn try_from(value: i64) -> Result<Self, Self::Error> {
-        match value {
-            0 => Ok(ParsedPageStatus::PendingParse),
-            1 => Ok(ParsedPageStatus::Ready),
-            _ => Err(rusqlite::Error::InvalidColumnType(
-                2,
-                "Invalid integer of {} for ParsedPageStatus".to_string(),
-                rusqlite::types::Type::Integer,
-            )),
-        }
-    }
-}
-
 #[derive(Serialize, Debug, PartialEq, Clone)]
-pub struct Page {
+pub struct ParsedPage {
     pub timestamp: DateTime<Utc>,
     pub category: String,
-    pub status: ParsedPageStatus,
 }
-impl DBTable for Page {
-    const TABLE_NAME: &'static str = "Pages";
+impl DBTable for ParsedPage {
+    const TABLE_NAME: &'static str = "Pages_Parsed";
     const TABLE_SCHEMA: &'static str = "
         id INTEGER PRIMARY KEY,
         category TEXT NOT NULL,
         timestamp INTEGER NOT NULL,
-        status INTEGER NOT NULL,
         UNIQUE(category, timestamp)
         FOREIGN KEY(category) REFERENCES SearchURLs(name)
     ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
         let mut stmt = conn.prepare(&format!(
-            "SELECT category, timestamp, status FROM {}",
+            "SELECT category, timestamp FROM {}",
             Self::TABLE_NAME
         ))?;
         let iter = stmt.query_map([], |row| {
-            Ok(Page {
+            Ok(ParsedPage {
                 category: row.get(0)?,
                 timestamp: row.get(1)?,
-                status: row.get::<_, i64>(2)?.try_into().unwrap(),
             })
         })?;
 
@@ -200,7 +174,7 @@ impl DBTable for Page {
         Ok(result)
     }
 }
-impl Page {
+impl ParsedPage {
     pub fn lookup(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
         let mut stmt = conn
             .prepare(&format!(
@@ -209,11 +183,10 @@ impl Page {
             ))
             .ok()?;
         stmt.query_one([timestamp], |row| {
-            Ok(Page {
+            Ok(ParsedPage {
                 // id: row.get(0)?,
                 category: row.get(1)?,
                 timestamp: row.get(2)?,
-                status: row.get::<_, i64>(3)?.try_into().unwrap(),
             })
         })
         .ok()
@@ -223,70 +196,13 @@ impl Page {
         let _ = conn
             .execute(
                 &format!(
-                    "INSERT OR REPLACE INTO {} (category, timestamp, status) VALUES (?1, ?2, ?3)",
+                    "INSERT OR REPLACE INTO {} (category, timestamp) VALUES (?1, ?2)",
                     Self::TABLE_NAME
                 ),
-                (&self.category, self.timestamp, self.status.clone() as i64),
+                (&self.category, self.timestamp),
            )
            .unwrap();
    }
 
-    pub fn lookup_status(
-        conn: &Connection,
-        status: ParsedPageStatus,
-        category: &str,
-        max: usize,
-    ) -> Vec<Self> {
-        let mut stmt = conn
-            .prepare(&format!(
-                "SELECT category, timestamp, status FROM {} WHERE status = {} AND category = ?1 LIMIT {}",
-                Self::TABLE_NAME,
-                status.clone() as i64,
-                max
-            ))
-            .unwrap();
-        stmt.query_map([category], |row| {
-            Ok(Self {
-                category: row.get(0)?,
-                timestamp: row.get(1)?,
-                status: row.get::<_, i64>(2)?.try_into().unwrap(),
-            })
-        })
-        .unwrap()
-        .inspect(|e| info!("debugging saw {:?}", e))
-        .filter_map(|e| e.ok())
-        .collect()
-    }
-
-    pub fn category_stats(conn: &Connection, category: &str) -> Vec<(ParsedPageStatus, i64, i64)> {
-        let mut res: Vec<(ParsedPageStatus, i64, i64)> = vec![];
-
-        for status in ParsedPageStatus::iter() {
-            let cnt_category_status = conn
-                .prepare(&format!(
-                    "SELECT COUNT(*) FROM {} WHERE category = ?1 AND status = {}",
-                    Self::TABLE_NAME,
-                    status.clone() as i64
-                ))
-                .ok()
-                .unwrap()
-                .query_one([category], |r| r.get(0))
-                .inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
-                .unwrap_or(0);
-            let cnt_category_total = conn
-                .prepare(&format!(
-                    "SELECT COUNT(*) FROM {} WHERE category = ?1",
-                    Self::TABLE_NAME
-                ))
-                .ok()
-                .unwrap()
-                .query_one([category], |r| r.get(0))
-                .inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
-                .unwrap_or(0);
-            res.push((status, cnt_category_status, cnt_category_total));
-        }
-        res
-    }
 }
 
 #[derive(Serialize, Debug, PartialEq, Copy, Clone)]
@@ -402,7 +318,7 @@ impl DBTable for ItemAppearances {
         current_bid_usd_cents INTEGER,
         UNIQUE(item, timestamp),
         FOREIGN KEY(item) REFERENCES Listings(item_id),
-        FOREIGN KEY(category, timestamp) REFERENCES Pages(category, timestamp)
+        FOREIGN KEY(category, timestamp) REFERENCES Pages_Parsed(category, timestamp)
     ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
@@ -708,7 +624,7 @@ pub fn get_initialized(path: Option<&Path>) -> Connection {
     SearchURL::initialize(&conn);
     Listing::initialize(&conn);
     ParsedStorage::initialize(&conn);
-    Page::initialize(&conn);
+    ParsedPage::initialize(&conn);
     ItemAppearances::initialize(&conn);
 
     conn
@@ -728,7 +644,7 @@ pub fn get_stats(conn: &Connection) -> Stats {
         rows_search_url: SearchURL::get_count(conn),
         rows_listing: Listing::get_count(conn),
         rows_parsed_storage: ParsedStorage::get_count(conn),
-        rows_parsed_page: Page::get_count(conn),
+        rows_parsed_page: ParsedPage::get_count(conn),
         rows_item_appearances: ItemAppearances::get_count(conn),
     }
 }
@@ -771,28 +687,12 @@ mod tests {
         parsed.add_or_update(&db);
         assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
 
-        let page = Page {
+        let page = ParsedPage {
             category: "ssd".to_owned(),
             timestamp: std::time::SystemTime::now().into(),
-            status: ParsedPageStatus::PendingParse,
         };
         page.add_or_update(&db);
-        assert_eq!(Page::lookup(&db, page.timestamp), Some(page.clone()));
-        assert_eq!(
-            Page::lookup_status(&db, ParsedPageStatus::PendingParse, "ssd", 10),
-            vec![page.clone()]
-        );
-        assert_eq!(
-            Page::lookup_status(&db, ParsedPageStatus::Ready, "ssd", 10),
-            vec![]
-        );
-        assert_eq!(
-            Page::category_stats(&db, "ssd"),
-            vec![
-                (ParsedPageStatus::PendingParse, 1, 1),
-                (ParsedPageStatus::Ready, 0, 1)
-            ]
-        );
+        assert_eq!(ParsedPage::lookup(&db, page.timestamp), Some(page.clone()));
 
         let apperance = ItemAppearances {
             item: listing.item_id,
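Net effect of the db.rs change: a scraped page row is now keyed purely by (category, timestamp), with no parse-status column to set or advance. A minimal round-trip sketch of the renamed type follows; it only uses items the diff shows main.rs importing (`ParsedPage`, `get_initialized`), and the `None` database path mirrors main.rs, so treat it as an illustration rather than code from the branch.

```rust
use ebay_scraper_rust::db::{ParsedPage, get_initialized};

fn main() {
    // Open the database the same way main.rs does (path = None).
    let conn = get_initialized(None);

    // A page is identified only by category + timestamp after this change.
    let page = ParsedPage {
        category: "ssd".to_owned(),
        timestamp: std::time::SystemTime::now().into(),
    };
    page.add_or_update(&conn);

    // Lookup mirrors the updated unit test in src/db.rs.
    assert_eq!(ParsedPage::lookup(&conn, page.timestamp), Some(page));
}
```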
src/lib.rs

@@ -2,4 +2,3 @@ pub mod db;
 pub mod parser;
 pub mod parser_ebay;
 pub mod parser_storage;
-pub mod xdg_dirs;
107  src/main.rs
@@ -1,14 +1,12 @@
-use actix_web::{App, HttpServer, Responder, Result, get, post, rt, web, web::Data};
+use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
 use chrono::{DateTime, Utc};
 use clap::Parser;
-use ebay_scraper_rust::db;
-use ebay_scraper_rust::db::DBTable;
-use ebay_scraper_rust::db::Page;
-use ebay_scraper_rust::parser;
+use ebay_scraper_rust::db::{
+    DBTable, ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
+    get_stats, listings_get_filtered,
+};
+use ebay_scraper_rust::parser::parse_dir;
 use ebay_scraper_rust::parser_storage;
-use ebay_scraper_rust::xdg_dirs;
-// use rt::mpsc;
-// use rt::time::timeout;
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::sync::Mutex;
@@ -20,6 +18,8 @@ use tracing_subscriber::fmt;
 use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 
+mod xdg_dirs;
+
 #[derive(Parser, Debug)]
 #[clap(
     name = "ebay-scraper-rust",
@@ -41,7 +41,7 @@ async fn listings_filtered_get(
     filter: web::Query<ListingsFilter>,
 ) -> Result<impl Responder> {
     let start = Instant::now();
-    let res = db::listings_get_filtered(
+    let res = listings_get_filtered(
         &db.lock().unwrap(),
         &DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
         filter.limit.unwrap_or(1_000),
@@ -61,7 +61,7 @@ async fn listing_get(
     db: Data<Mutex<rusqlite::Connection>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    Ok(web::Json(db::Listing::lookup(&db.lock().unwrap(), *id)))
+    Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
 }
 
 #[get("/listing/{id}/parsed")]
@@ -69,10 +69,7 @@ async fn listing_parse_get(
     db: Data<Mutex<rusqlite::Connection>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    Ok(web::Json(db::ParsedStorage::lookup(
-        &db.lock().unwrap(),
-        *id,
-    )))
+    Ok(web::Json(ParsedStorage::lookup(&db.lock().unwrap(), *id)))
 }
 
 #[derive(Serialize)]
@@ -86,7 +83,7 @@ async fn listing_history_get(
     db: Data<Mutex<rusqlite::Connection>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    let history: Vec<_> = db::ItemAppearances::lookup(&db.lock().unwrap(), *id)
+    let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id)
         .iter()
         // .inspect(|e| info!("got: {:?}", e))
         .filter_map(|e| {
@@ -103,7 +100,7 @@ async fn listing_history_get(
 async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
     let mut cnt = 0;
     let db_unlocked = db.lock().unwrap();
-    db::Listing::lookup_non_parsed(&db_unlocked)
+    Listing::lookup_non_parsed(&db_unlocked)
         .iter()
         .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
         .inspect(|_| cnt = cnt + 1)
@@ -114,33 +111,7 @@ async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Re
 
 #[get("/category")]
 async fn category_getnames(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
-    Ok(web::Json(db::SearchURL::names(&db.lock().unwrap())))
-}
-
-async fn category_discover_worker(
-    db: Data<Mutex<rusqlite::Connection>>,
-    downloaddir: Data<PathBuf>,
-    category: web::Path<String>,
-) {
-}
-
-#[post("/category/{category}/discover")]
-#[instrument(skip_all)]
-async fn category_discover(
-    db: Data<Mutex<rusqlite::Connection>>,
-    downloaddir: Data<PathBuf>,
-    category: web::Path<String>,
-) -> Result<impl Responder> {
-    let start = Instant::now();
-    parser::add_pages(
-        &db.lock().unwrap(),
-        &downloaddir.join(category.clone()),
-        &category,
-    );
-    let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
-
-    info!("Added many pages to the category, took {elapsed} ms.");
-    Ok("")
+    Ok(web::Json(SearchURL::names(&db.lock().unwrap())))
 }
 
 #[post("/category/{category}/parse")]
@@ -149,41 +120,34 @@ async fn category_parse(
     db: Data<Mutex<rusqlite::Connection>>,
     downloaddir: Data<PathBuf>,
     category: web::Path<String>,
-) -> Result<impl Responder> {
-    parser::parse_pages(&db.lock().unwrap(), &downloaddir, &category, 100);
-    Ok("")
-}
-
-#[get("/category/{category}/parse")]
-#[instrument(skip_all)]
-async fn category_parse_get(
-    db: Data<Mutex<rusqlite::Connection>>,
-    category: web::Path<String>,
 ) -> Result<impl Responder> {
     let start = Instant::now();
-    let stats = Page::category_stats(&db.lock().unwrap(), &category);
-    stats
-        .iter()
-        .for_each(|(status, cnt, total)| info!("{:?} {} {}", status, cnt, total));
+    let count = parse_dir(
+        &downloaddir.join(category.clone()),
+        &category,
+        &db.lock().unwrap(),
+    )
+    .unwrap();
     let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
-    info!("Found, took {elapsed} ms.");
-    Ok(web::Json(stats))
+    info!("Added {count} listings, took {elapsed} ms.");
+    Ok(count.to_string())
 }
 
 #[get("/stats")]
 async fn stats_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
-    Ok(web::Json(db::get_stats(&db.lock().unwrap())))
+    Ok(web::Json(get_stats(&db.lock().unwrap())))
 }
 
 #[get("/admin")]
 async fn admin_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
     let db = db.lock().unwrap();
     let query_start_time = Instant::now();
-    let search_urls = db::SearchURL::get_all(&db).unwrap_or_default();
-    let parsed_pages = db::Page::get_all(&db).unwrap_or_default();
-    let parsed_storages = db::ParsedStorage::get_all(&db).unwrap_or_default();
-    let item_appearances = db::ItemAppearances::get_all(&db).unwrap_or_default();
-    let listings = db::Listing::get_all(&db).unwrap_or_default();
+    let search_urls = SearchURL::get_all(&db).unwrap_or_default();
+    let parsed_pages = ParsedPage::get_all(&db).unwrap_or_default();
+    let parsed_storages = ParsedStorage::get_all(&db).unwrap_or_default();
+    let item_appearances = ItemAppearances::get_all(&db).unwrap_or_default();
+    let listings = Listing::get_all(&db).unwrap_or_default();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
 
     let html_gen_start_time = Instant::now();
@@ -310,10 +274,6 @@ fn generate_table<T: Serialize>(title: &str, data: &[T]) -> String {
     )
 }
 
-async fn pages_pickup() -> std::io::Result<()> {
-    Ok(())
-}
-
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
     tracing_subscriber::registry()
@@ -327,10 +287,10 @@ async fn main() -> std::io::Result<()> {
         "Starting with scraped data dir of \"{}\".",
         scrapedatadir.to_str().unwrap()
     );
-    let db_mutex = Data::new(Mutex::new(db::get_initialized(None)));
+    let db_mutex = Data::new(Mutex::new(get_initialized(None)));
 
     // Prepare our backend via pulling in what catagories we are preconfigured with.
-    db::SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");
+    SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");
 
     HttpServer::new(move || {
         App::new()
@@ -338,13 +298,10 @@ async fn main() -> std::io::Result<()> {
             .service(listing_get)
             .service(listings_filtered_get)
             .service(listing_history_get)
-            .service(listing_parse_get)
             // Category handlers
             .service(parse_listings)
             .service(category_parse)
-            .service(category_discover)
             .service(category_getnames)
-            .service(category_parse_get)
             // Gnarly info dump
             .service(admin_get)
             .service(stats_get)
@@ -355,6 +312,4 @@ async fn main() -> std::io::Result<()> {
         .bind(("0.0.0.0", 9876))?
         .run()
         .await
-
-    // tokio::join!(server, pages_pickup)
 }
142  src/parser.rs
@@ -1,13 +1,14 @@
 use crate::{
-    db::{Page, ParsedPageStatus, SearchURL},
+    db::{ParsedPage, SearchURL},
     parser_ebay,
 };
 use rayon::prelude::*;
 use serde::Deserialize;
 use serde_json;
-use std::path::Path;
 use std::time::Instant;
+use std::{io::Read, path::Path};
 use tracing::{debug, error, info};
+use zstd;
 
 fn timestamps_from_dir(path: &Path) -> Vec<i64> {
     if !std::fs::exists(path).expect("Directory must exist") {
@@ -20,23 +21,52 @@ fn timestamps_from_dir(path: &Path) -> Vec<i64> {
     std::fs::read_dir(path)
         .unwrap()
         .map(|fpath| fpath.unwrap().path())
-        .filter_map(|fstem| {
-            fstem
-                .file_stem()
-                .and_then(|s| s.to_str())
-                .expect("Invalid file name")
-                .parse()
-                .ok()
+        .filter_map(|fname| {
+            // Turns out file_stem() doesn't handle multiple extensions and
+            // file_prefix() is still in not stable.
+            Some(fname.file_stem()?.to_str()?.split_once('.')?.0.to_owned())
         })
+        .filter_map(|fname| fname.parse().ok())
         .collect()
 }
 
-pub fn add_pages(db: &rusqlite::Connection, dir: &Path, category: &str) {
+fn read_timestamp_from_dir(
+    dir: &Path,
+    timestamp: &chrono::DateTime<chrono::Utc>,
+) -> Option<String> {
+    // First check for the normal html version, which we can just read straight.
+    let page_path = dir.join(format!("{}.{}", timestamp.timestamp(), "html"));
+    if page_path.exists() {
+        return std::fs::read_to_string(&page_path)
+            .inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
+            .ok();
+    }
+
+    // And now if it's compresed but with zstd.
+    let page_path = dir.join(format!("{}.{}.{}", timestamp.timestamp(), "html", "zst"));
+    if page_path.exists() {
+        let f = std::fs::File::open(&page_path)
+            .inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
+            .ok()?;
+        let mut s = String::new();
+        zstd::Decoder::new(f).ok()?.read_to_string(&mut s).ok()?;
+        return Some(s);
+    }
+
+    error!(
+        "Failed to lookup file for timestamp {} in {}, bailing ...",
+        timestamp,
+        dir.display()
+    );
+    None
+}
+
+pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
     // Ensure the category is created.
     let url_fpath = dir.join("url.json");
     let url_contents = std::fs::read_to_string(&url_fpath)
         .inspect_err(|e| error!("Failed reading {}: {e}", url_fpath.display()))
-        .unwrap();
+        .ok()?;
 
     #[derive(Deserialize)]
     struct URLJSON {
@@ -50,75 +80,59 @@ pub fn add_pages(db: &rusqlite::Connection, dir: &Path, category: &str) {
 
     // See all pages haven't been seen before.
     let query_start_time = Instant::now();
-    let to_parse = timestamps_from_dir(dir)
-        .into_iter()
-        .filter(|t| {
-            let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
-            let p = Page::lookup(&db, ts);
-
-            // Timestamp never seen before, lets pass it on.
-            if p.is_none() {
-                info!(
-                    "Page Timestamp:{} Catagory:{category} never seen before",
-                    ts.timestamp()
-                );
-                return true;
-            }
-
-            // Timestamp was seen before *and* from the same catagory, don't pass it on.
-            if p.unwrap().category == *category {
-                info!(
-                    "Page Timestamp:{} Catagory:{category} seen before, skipping",
-                    ts.timestamp()
-                );
-                return false;
-            }
-
+    let to_parse = timestamps_from_dir(dir).into_iter().filter(|t| {
+        let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
+        let p = ParsedPage::lookup(&db, ts);
+
+        // Timestamp never seen before, lets pass it on.
+        if p.is_none() {
             info!(
-                "Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}",
+                "Page Timestamp:{} Catagory:{category} never seen before, processing ...",
                 ts.timestamp()
             );
             return true;
-        })
-        .collect::<Vec<_>>();
+        }
+
+        // Timestamp was seen before *and* from the same catagory, don't pass
+        // it on.
+        if p.unwrap().category == *category {
+            info!(
+                "Page Timestamp:{} Catagory:{category} seen before, skipping ...",
+                ts.timestamp()
+            );
+            return false;
+        }
+
+        info!(
+            "Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}, processing ...",
+            ts.timestamp()
+        );
+        return true;
+    }).collect::<Vec<_>>();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
     info!("Time spent finding pages to parse:{total_query_time} ms");
 
-    // Say we are going to parse the pages.
-    let query_start_time = Instant::now();
-    let pages = to_parse.iter().map(|p| Page {
-        timestamp: chrono::DateTime::from_timestamp(*p, 0).unwrap(),
-        category: category.to_string(),
-        status: crate::db::ParsedPageStatus::PendingParse,
-    });
-    for p in pages {
-        p.add_or_update(&db);
-    }
-    let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
-    info!("Time spent inserting pages marked as ready to parse:{total_query_time} ms");
-}
-
-pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch: usize) {
-    let to_parse = Page::lookup_status(&db, ParsedPageStatus::PendingParse, category, batch);
+    // For each page, read the file and parse it.
     let query_start_time = Instant::now();
     let to_add = to_parse
         .par_iter()
         .map(|p| {
-            let page_path = dir
-                .join(category)
-                .join(format!("{}.html", p.timestamp.timestamp()));
-            let page_contents = std::fs::read_to_string(&page_path)
-                .inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
-                .ok()?;
+            let ts = chrono::DateTime::from_timestamp(*p, 0).unwrap();
+            let paged_info = ParsedPage {
+                timestamp: ts,
+                category: category.to_string(),
+            };
+            let page_contents = read_timestamp_from_dir(dir, &ts)?;
             let elements =
-                parser_ebay::parse_from_ebay_page(&page_contents, &p.timestamp, &category).unwrap();
+                parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
             info!(
                 "Page Timestamp:{} Catagory:{category}, found {} elements",
-                p.timestamp.timestamp(),
+                ts.timestamp(),
                 elements.len()
            );
 
-            Some((p, elements))
+            Some((paged_info, elements))
        })
        .collect::<Vec<_>>();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
@@ -126,6 +140,7 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
 
     // And lastly add it to our database!
     let query_start_time = Instant::now();
+    let mut added_count = 0;
     for iter in to_add {
         if iter.is_none() {
             continue;
@@ -134,6 +149,7 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
         paged_info.add_or_update(&db);
 
         for e in elements {
+            added_count += 1;
             e.0.add_or_update(&db);
             e.1.add_or_update(&db);
             debug!(
@@ -146,4 +162,6 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
     }
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
     info!("Time spent adding parsed pages: {total_query_time} ms");
+
+    return Some(added_count);
 }
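One detail worth spelling out from the `timestamps_from_dir` change above: once scrapes can be named `<timestamp>.html.zst`, `Path::file_stem()` only strips the final extension, which is why the closure now splits on the first `.` itself. A tiny standard-library-only illustration of that behavior (not code from the branch):

```rust
use std::path::Path;

fn main() {
    // file_stem() removes just the last extension, so a doubly-suffixed
    // scrape keeps ".html" in its "stem".
    let p = Path::new("1750369463.html.zst");
    assert_eq!(p.file_stem().unwrap(), "1750369463.html");

    // Splitting on the first '.' recovers the bare timestamp, matching what
    // the new filter_map in timestamps_from_dir() does.
    let stem = p.file_stem().unwrap().to_str().unwrap();
    let ts: i64 = stem.split_once('.').unwrap().0.parse().unwrap();
    assert_eq!(ts, 1750369463);
}
```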
src/parser_ebay.rs

@@ -172,12 +172,22 @@ pub fn parse_from_ebay_page(
 mod tests {
     use super::*;
     use similar_asserts::assert_eq;
+    use std::io::Read;
+    use zstd;
 
     #[test_log::test]
     fn parse() {
         let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
-        let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
-        let parsed = parse_from_ebay_page(html, &timestamp, "ssd").unwrap();
+        let zstd = include_bytes!("../test_data/scraper/raw_scraped/ssd/1750369463.html.zst");
+        let cursor = std::io::Cursor::new(zstd);
+
+        let mut html = String::new();
+        zstd::Decoder::new(cursor)
+            .unwrap()
+            .read_to_string(&mut html)
+            .unwrap();
+
+        let parsed = parse_from_ebay_page(&html, &timestamp, "ssd").unwrap();
         // assert_eq!(parsed.len(), 62);
 
         let parsed = parsed.first_chunk::<10>().unwrap();
(shell scrape script)

@@ -5,6 +5,8 @@ URL_PER_PAGE_240="&_ipg=240"
 URL_MIN_PRICE_USD_60="&_udlo=60.00"
 URL_SEARCHTERM_NONE="&_nkw="
 URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
+URL_SORTBY_NEWLY_LISTED="&_sop=10"
+URL_SORTBY_ENDING_SOONEST="&_sop=1"
 URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"
 
 if [ -z "${XDG_DATA_HOME}" ]; then
@@ -12,20 +14,85 @@ if [ -z "${XDG_DATA_HOME}" ]; then
     exit
 fi
 
-DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
+DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
 mkdir -p "$DIR_SSDS"
 if [ ! -s "$DIR_SSDS/url.json" ]; then
     URL_CATEGORY_SSD="&_sacat=175669"
-    URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
+    URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
     echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
 fi
-wget -o "$DIR_SSDS/$(date +%s).html" "$(jq '.url' $DIR_SSDS/url.json)"
+curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
+sleep 2
+curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
 
-DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
+DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
 mkdir -p "$DIR_MINIPC"
 if [ ! -s "$DIR_MINIPC/url.json" ]; then
     URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
-    URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
+    URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
     echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
 fi
-wget -o "$DIR_MINIPC/$(date +%s).html" "$(jq '.url' $DIR_MINIPC/url.json)"
+curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
+sleep 2
+curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
+
+# Little helper to ensure we see entries in journald
+echo Done
+
+# If needing to do a mass compression;
+# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}
+
+# Level compression analysis;
+#
+# A single scraped result;
+# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
+# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
+# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
+# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
+# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
+# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
+# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
+# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
+# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
+# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
+# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
+# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
+# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
+# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
+# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
+# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
+# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
+# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
+# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
+# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
+# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
+#
+# Lets see if we get benefits tar'ing and them compressing;
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755012328.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755012331.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755015932.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755015929.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755019567.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755019564.html
+# -rw-r--r-- 1 hak8or users 16M Sep  1 12:23 175501.tar
+# ➜ for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
+# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
+# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
+# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
+# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
+# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
+# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
+# 175501.tar : 5.36% ( 15.6 MiB => 854 KiB, 175501.tar.9)
+# 175501.tar : 5.32% ( 15.6 MiB => 847 KiB, 175501.tar.10)
+# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.11)
+# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.12)
+# 175501.tar : 5.48% ( 15.6 MiB => 872 KiB, 175501.tar.13)
+# 175501.tar : 5.42% ( 15.6 MiB => 864 KiB, 175501.tar.14)
+# 175501.tar : 5.19% ( 15.6 MiB => 828 KiB, 175501.tar.15)
+# 175501.tar : 5.31% ( 15.6 MiB => 845 KiB, 175501.tar.16)
+# 175501.tar : 5.01% ( 15.6 MiB => 798 KiB, 175501.tar.17)
+# 175501.tar : 5.04% ( 15.6 MiB => 803 KiB, 175501.tar.18)
+# 175501.tar : 4.84% ( 15.6 MiB => 771 KiB, 175501.tar.19)
+# 175501.tar : 4.79% ( 15.6 MiB => 764 KiB, 175501.tar.20)
+# 175501.tar : 4.74% ( 15.6 MiB => 755 KiB, 175501.tar.21)
+# 175501.tar : 4.73% ( 15.6 MiB => 753 KiB, 175501.tar.22)
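A rough read of those numbers: at level 19 the six individually compressed pages come to about 6 x 213 KiB, roughly 1.25 MiB, while the single 15.6 MiB tar of the same pages compresses to 771 KiB, around 40% smaller. At level 3 the tar buys essentially nothing (1.39 MiB vs 6 x 240 KiB, about 1.41 MiB). The jump between tar.8 and tar.9 is likely where zstd's match window grows past the size of one ~2.6 MiB page, letting boilerplate repeated across scrapes be deduplicated.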
File diff suppressed because one or more lines are too long
BIN  test_data/scraper/raw_scraped/ssd/1750369463.html.zst (new file)
Binary file not shown.