4 Commits

Author SHA1 Message Date
cb8025becd Fetch script now much more capable with headed chromium + playwright
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m29s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m47s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 5m7s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m30s
2025-09-09 22:47:45 -04:00
0039078f41 Moaaarrr
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m26s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m42s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 5m15s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m40s
2025-09-09 00:31:23 -04:00
4ae1622f02 Add LLM based parsing
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m11s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m34s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 5m14s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 11m4s
2025-09-07 00:08:06 -04:00
b538dd8012 Allow saving and ingesting from zstd compressed scrapes
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 3m46s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m3s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m14s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 9m19s
2025-09-01 14:45:21 -04:00
13 changed files with 1545 additions and 8604 deletions

700
Cargo.lock generated
View File

@@ -39,8 +39,8 @@ dependencies = [
"flate2",
"foldhash",
"futures-core",
"h2",
"http",
"h2 0.3.26",
"http 0.2.12",
"httparse",
"httpdate",
"itoa",
@@ -76,7 +76,7 @@ checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8"
dependencies = [
"bytestring",
"cfg-if",
"http",
"http 0.2.12",
"regex",
"regex-lite",
"serde",
@@ -289,6 +289,12 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "atomic-waker"
version = "1.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
[[package]]
name = "autocfg"
version = "1.5.0"
@@ -491,6 +497,16 @@ dependencies = [
"version_check",
]
[[package]]
name = "core-foundation"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
@@ -679,9 +695,12 @@ dependencies = [
"chrono",
"clap",
"dirs",
"futures",
"lazy_static",
"num_enum",
"rayon",
"regex",
"reqwest",
"rusqlite",
"scraper",
"serde",
@@ -690,6 +709,7 @@ dependencies = [
"test-log",
"tracing",
"tracing-subscriber",
"zstd",
]
[[package]]
@@ -746,6 +766,16 @@ version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "errno"
version = "0.3.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad"
dependencies = [
"libc",
"windows-sys 0.59.0",
]
[[package]]
name = "fallible-iterator"
version = "0.3.0"
@@ -758,6 +788,12 @@ version = "0.1.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
[[package]]
name = "fastrand"
version = "2.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
[[package]]
name = "flate2"
version = "1.1.2"
@@ -780,6 +816,21 @@ version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
[[package]]
name = "foreign-types"
version = "0.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
dependencies = [
"foreign-types-shared",
]
[[package]]
name = "foreign-types-shared"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
@@ -799,12 +850,65 @@ dependencies = [
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
dependencies = [
"futures-core",
"futures-sink",
]
[[package]]
name = "futures-core"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
@@ -823,10 +927,16 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
"pin-project-lite",
"pin-utils",
"slab",
]
[[package]]
@@ -897,7 +1007,26 @@ dependencies = [
"futures-core",
"futures-sink",
"futures-util",
"http",
"http 0.2.12",
"indexmap",
"slab",
"tokio",
"tokio-util",
"tracing",
]
[[package]]
name = "h2"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386"
dependencies = [
"atomic-waker",
"bytes",
"fnv",
"futures-core",
"futures-sink",
"http 1.3.1",
"indexmap",
"slab",
"tokio",
@@ -952,6 +1081,40 @@ dependencies = [
"itoa",
]
[[package]]
name = "http"
version = "1.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
dependencies = [
"bytes",
"fnv",
"itoa",
]
[[package]]
name = "http-body"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
dependencies = [
"bytes",
"http 1.3.1",
]
[[package]]
name = "http-body-util"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
dependencies = [
"bytes",
"futures-core",
"http 1.3.1",
"http-body",
"pin-project-lite",
]
[[package]]
name = "httparse"
version = "1.10.1"
@@ -964,6 +1127,86 @@ version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
[[package]]
name = "hyper"
version = "1.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e"
dependencies = [
"atomic-waker",
"bytes",
"futures-channel",
"futures-core",
"h2 0.4.12",
"http 1.3.1",
"http-body",
"httparse",
"itoa",
"pin-project-lite",
"pin-utils",
"smallvec",
"tokio",
"want",
]
[[package]]
name = "hyper-rustls"
version = "0.27.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
dependencies = [
"http 1.3.1",
"hyper",
"hyper-util",
"rustls",
"rustls-pki-types",
"tokio",
"tokio-rustls",
"tower-service",
]
[[package]]
name = "hyper-tls"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
dependencies = [
"bytes",
"http-body-util",
"hyper",
"hyper-util",
"native-tls",
"tokio",
"tokio-native-tls",
"tower-service",
]
[[package]]
name = "hyper-util"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e"
dependencies = [
"base64",
"bytes",
"futures-channel",
"futures-core",
"futures-util",
"http 1.3.1",
"http-body",
"hyper",
"ipnet",
"libc",
"percent-encoding",
"pin-project-lite",
"socket2",
"system-configuration",
"tokio",
"tower-service",
"tracing",
"windows-registry",
]
[[package]]
name = "iana-time-zone"
version = "0.1.63"
@@ -1111,6 +1354,22 @@ dependencies = [
"hashbrown",
]
[[package]]
name = "ipnet"
version = "2.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
[[package]]
name = "iri-string"
version = "0.7.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2"
dependencies = [
"memchr",
"serde",
]
[[package]]
name = "is_terminal_polyfill"
version = "1.70.1"
@@ -1182,6 +1441,12 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "linux-raw-sys"
version = "0.9.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
[[package]]
name = "litemap"
version = "0.8.0"
@@ -1294,6 +1559,23 @@ dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "native-tls"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e"
dependencies = [
"libc",
"log",
"openssl",
"openssl-probe",
"openssl-sys",
"schannel",
"security-framework",
"security-framework-sys",
"tempfile",
]
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
@@ -1325,6 +1607,28 @@ dependencies = [
"autocfg",
]
[[package]]
name = "num_enum"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a"
dependencies = [
"num_enum_derive",
"rustversion",
]
[[package]]
name = "num_enum_derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "object"
version = "0.36.7"
@@ -1346,6 +1650,50 @@ version = "1.70.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
[[package]]
name = "openssl"
version = "0.10.73"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8"
dependencies = [
"bitflags",
"cfg-if",
"foreign-types",
"libc",
"once_cell",
"openssl-macros",
"openssl-sys",
]
[[package]]
name = "openssl-macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "openssl-probe"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
[[package]]
name = "openssl-sys"
version = "0.9.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571"
dependencies = [
"cc",
"libc",
"pkg-config",
"vcpkg",
]
[[package]]
name = "option-ext"
version = "0.2.0"
@@ -1487,6 +1835,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-crate"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
dependencies = [
"toml_edit",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
@@ -1645,6 +2002,62 @@ version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
[[package]]
name = "reqwest"
version = "0.12.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb"
dependencies = [
"base64",
"bytes",
"encoding_rs",
"futures-channel",
"futures-core",
"futures-util",
"h2 0.4.12",
"http 1.3.1",
"http-body",
"http-body-util",
"hyper",
"hyper-rustls",
"hyper-tls",
"hyper-util",
"js-sys",
"log",
"mime",
"native-tls",
"percent-encoding",
"pin-project-lite",
"rustls-pki-types",
"serde",
"serde_json",
"serde_urlencoded",
"sync_wrapper",
"tokio",
"tokio-native-tls",
"tower",
"tower-http",
"tower-service",
"url",
"wasm-bindgen",
"wasm-bindgen-futures",
"web-sys",
]
[[package]]
name = "ring"
version = "0.17.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
dependencies = [
"cc",
"cfg-if",
"getrandom 0.2.16",
"libc",
"untrusted",
"windows-sys 0.52.0",
]
[[package]]
name = "rusqlite"
version = "0.36.0"
@@ -1666,6 +2079,52 @@ version = "0.1.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f"
[[package]]
name = "rustix"
version = "1.0.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8"
dependencies = [
"bitflags",
"errno",
"libc",
"linux-raw-sys",
"windows-sys 0.59.0",
]
[[package]]
name = "rustls"
version = "0.23.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc"
dependencies = [
"once_cell",
"rustls-pki-types",
"rustls-webpki",
"subtle",
"zeroize",
]
[[package]]
name = "rustls-pki-types"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79"
dependencies = [
"zeroize",
]
[[package]]
name = "rustls-webpki"
version = "0.103.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc"
dependencies = [
"ring",
"rustls-pki-types",
"untrusted",
]
[[package]]
name = "rustversion"
version = "1.0.21"
@@ -1678,6 +2137,15 @@ version = "1.0.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
[[package]]
name = "schannel"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
dependencies = [
"windows-sys 0.59.0",
]
[[package]]
name = "scopeguard"
version = "1.2.0"
@@ -1699,6 +2167,29 @@ dependencies = [
"tendril",
]
[[package]]
name = "security-framework"
version = "2.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
dependencies = [
"bitflags",
"core-foundation",
"core-foundation-sys",
"libc",
"security-framework-sys",
]
[[package]]
name = "security-framework-sys"
version = "2.14.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "selectors"
version = "0.26.0"
@@ -1891,6 +2382,12 @@ version = "0.11.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
[[package]]
name = "subtle"
version = "2.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
[[package]]
name = "syn"
version = "2.0.103"
@@ -1902,6 +2399,15 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "sync_wrapper"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
dependencies = [
"futures-core",
]
[[package]]
name = "synstructure"
version = "0.13.2"
@@ -1913,6 +2419,40 @@ dependencies = [
"syn",
]
[[package]]
name = "system-configuration"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
dependencies = [
"bitflags",
"core-foundation",
"system-configuration-sys",
]
[[package]]
name = "system-configuration-sys"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
dependencies = [
"core-foundation-sys",
"libc",
]
[[package]]
name = "tempfile"
version = "3.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
dependencies = [
"fastrand",
"getrandom 0.3.3",
"once_cell",
"rustix",
"windows-sys 0.59.0",
]
[[package]]
name = "tendril"
version = "0.4.3"
@@ -2033,6 +2573,26 @@ dependencies = [
"windows-sys 0.52.0",
]
[[package]]
name = "tokio-native-tls"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
dependencies = [
"native-tls",
"tokio",
]
[[package]]
name = "tokio-rustls"
version = "0.26.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
dependencies = [
"rustls",
"tokio",
]
[[package]]
name = "tokio-util"
version = "0.7.15"
@@ -2046,6 +2606,68 @@ dependencies = [
"tokio",
]
[[package]]
name = "toml_datetime"
version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
[[package]]
name = "toml_edit"
version = "0.22.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
dependencies = [
"indexmap",
"toml_datetime",
"winnow",
]
[[package]]
name = "tower"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
dependencies = [
"futures-core",
"futures-util",
"pin-project-lite",
"sync_wrapper",
"tokio",
"tower-layer",
"tower-service",
]
[[package]]
name = "tower-http"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2"
dependencies = [
"bitflags",
"bytes",
"futures-util",
"http 1.3.1",
"http-body",
"iri-string",
"pin-project-lite",
"tower",
"tower-layer",
"tower-service",
]
[[package]]
name = "tower-layer"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
[[package]]
name = "tower-service"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
[[package]]
name = "tracing"
version = "0.1.41"
@@ -2108,6 +2730,12 @@ dependencies = [
"tracing-log",
]
[[package]]
name = "try-lock"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
[[package]]
name = "typenum"
version = "1.18.0"
@@ -2138,6 +2766,12 @@ version = "0.2.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
[[package]]
name = "untrusted"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
[[package]]
name = "url"
version = "2.5.4"
@@ -2185,6 +2819,15 @@ version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "want"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
dependencies = [
"try-lock",
]
[[package]]
name = "wasi"
version = "0.11.1+wasi-snapshot-preview1"
@@ -2226,6 +2869,19 @@ dependencies = [
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-futures"
version = "0.4.50"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
dependencies = [
"cfg-if",
"js-sys",
"once_cell",
"wasm-bindgen",
"web-sys",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.100"
@@ -2258,6 +2914,16 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "web-sys"
version = "0.3.77"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
dependencies = [
"js-sys",
"wasm-bindgen",
]
[[package]]
name = "winapi"
version = "0.3.9"
@@ -2321,6 +2987,17 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
[[package]]
name = "windows-registry"
version = "0.5.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e"
dependencies = [
"windows-link",
"windows-result",
"windows-strings",
]
[[package]]
name = "windows-result"
version = "0.3.4"
@@ -2421,6 +3098,15 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
dependencies = [
"memchr",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
@@ -2501,6 +3187,12 @@ dependencies = [
"synstructure",
]
[[package]]
name = "zeroize"
version = "1.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
[[package]]
name = "zerotrie"
version = "0.2.2"

View File

@@ -8,9 +8,12 @@ actix-web = "4.11.0"
chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.40", features = ["derive"] }
dirs = "6.0.0"
futures = "0.3.31"
lazy_static = "1.5.0"
num_enum = "0.7.4"
rayon = "1.10.0"
regex = "1.11.1"
reqwest = { version = "0.12.23", features = ["blocking"] }
rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
scraper = "0.23.1"
serde = { version = "1.0.219", features = ["derive"] }
@@ -18,6 +21,7 @@ serde_json = "1.0.140"
test-log = { version = "0.2.17", features = ["trace"] }
tracing = { version = "0.1.41", features = ["attributes"] }
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
zstd = "0.13.3"
[dev-dependencies]
similar-asserts = "1.7.0"

110
readme.md
View File

@@ -10,3 +10,113 @@ echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
echo run3 && http GET "$URL_BASE/listing/388484391867" && \
echo run4 && http GET "$URL_BASE/listing/286605201240/history"
```
And some jq usage for raw interaction with the data:
```bash
# Download a bunch of listings.
http https://scraper.hak8or.com/api/listings since==0 limit==20 > listings.json
# Show what a single listing looks like.
cat listings.json | jq '.[0]'
{
"listing": {
"id": 22563,
"item_id": 286707621236,
"title": "WD_BLACK SN770M 2TB M.2 NVMe Internal SSD (WDBDNH0020BBK-WRSN)",
"buy_it_now_price_cents": null,
"has_best_offer": false,
"image_url": "https://i.ebayimg.com/images/g/It4AAeSwzz5oddoa/s-l140.jpg"
},
"history": [
{
"item": 286707621236,
"timestamp": "2025-07-15T04:46:54Z",
"category": "ssd",
"current_bid_usd_cents": 12900
}
],
"parsed": [
{
"id": 6,
"item": 286707621236,
"total_gigabytes": 2048,
"quantity": 1,
"individual_size_gigabytes": 2048,
"parse_engine": 0,
"needed_description_check": false
}
]
}
# Show the 1st and 2nd items, but only grab a few specific entries.
cat listings_small.json | jq '[.[1:3][] | {
item_id: .listing.item_id,
title: .listing.title,
parsed: .parsed[] | {
total_gigabytes,
quantity,
individual_size_gigabytes
}
}]'
[
{
"item_id": 297545995095,
"title": "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!",
"parsed": {
"total_gigabytes": 1024,
"quantity": 1,
"individual_size_gigabytes": 1024
}
},
{
"item_id": 127220979797,
"title": "Kingston NV2 2TB M.2 3500MG/S NVMe Internal SSD PCIe 4.0 Gen SNV2S/2000G C-/#qWT",
"parsed": {
"total_gigabytes": 2048,
"quantity": 1,
"individual_size_gigabytes": 2048
}
}
]
```
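As one more slice of the same data, jq can also approximate the price per terabyte by combining the most recent bid with the parsed capacity. This is only a hypothetical sketch against the `listings.json` shape shown above (the `usd_per_tb` field name is made up here), and it assumes `current_bid_usd_cents` is set on the latest history entry:
```bash
cat listings.json | jq '[.[]
  | select((.history | length) > 0 and (.parsed | length) > 0)
  | {
      item_id: .listing.item_id,
      # Latest bid in dollars divided by parsed capacity in terabytes.
      usd_per_tb: (((.history | last).current_bid_usd_cents / 100)
                   / (.parsed[0].total_gigabytes / 1024))
    }]'
```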
And now an LLM based parse, where the prompt is this (189 tokens for Gemini 2.5 Flash Lite):
```
I will provide you with a listing title I want you to analyse. Then you will tell me the total gigabytes of all drives listed in the listing, how many drives are specified in the title, and the gigabytes of each drive in the listing. Here is an example for a title of "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
```
{
"total_gigabytes": 1024,
"quantity": 1,
"individual_size_gigabytes": 1024
}
```
Reply with "OK" (and _only_ "OK") if you understand this. After you reply with that, I will provide you with a title, and then you will reply with solely the requested json (and ONLY said json).
```
And passing the following title (30 tokens):
```
Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD
```
returns the following JSON (41 tokens):
```json
{
"total_gigabytes": 1536,
"quantity": 3,
"individual_size_gigabytes": 512
}
```
And another example, sending this title (49 tokens):
```
(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)
```
returns the following JSON (42 tokens):
```json
{
"total_gigabytes": 1536,
"quantity": 6,
"individual_size_gigabytes": 256
}
```
So for 1 listing we have a 189 token system prompt, a ~45 token title prompt, and a ~42 token parsed reply. Given 30,000 listings, that's 5,670,000 system prompt tokens as input, 1,350,000 title prompt tokens as input, and 1,260,000 parsed reply tokens as output. Assuming Gemini 2.5 Flash Lite, which is $0.10/M for input and $0.40/M for output, we would pay $0.702 for the input and $0.504 for the output, or $1.206 total.
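And a quick back-of-the-envelope check of that arithmetic. This is a throwaway sketch, not part of the project; the per-listing token counts and the $0.10/M input / $0.40/M output pricing are just the estimates quoted above, and it assumes `bc` is installed:
```bash
LISTINGS=30000
INPUT_TOKENS=$(( LISTINGS * (189 + 45) ))   # system prompt + title, per listing
OUTPUT_TOKENS=$(( LISTINGS * 42 ))          # parsed JSON reply, per listing
echo "input tokens:  $INPUT_TOKENS"         # 7,020,000
echo "output tokens: $OUTPUT_TOKENS"        # 1,260,000
echo "total cost: \$$(echo "scale=3; $INPUT_TOKENS * 0.10 / 1000000 + $OUTPUT_TOKENS * 0.40 / 1000000" | bc)"
```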

235
src/db.rs
View File

@@ -1,5 +1,8 @@
use chrono::{DateTime, Utc};
use num_enum::TryFromPrimitive;
use rusqlite::Connection;
use rusqlite::ToSql;
use rusqlite::types::FromSql;
use serde::Deserialize;
use serde::Serialize;
use std::path::Path;
@@ -205,15 +208,34 @@ impl ParsedPage {
}
}
#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
#[repr(i64)]
#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)]
pub enum StorageParsingEngineVersion {
Testing = 0,
Regex = 1,
LLM = 2,
}
impl ToSql for StorageParsingEngineVersion {
fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
Ok((*self as i64).into())
}
}
impl FromSql for StorageParsingEngineVersion {
fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
let v = value.as_i64()?;
Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
}
}
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedStorage {
pub id: i64,
pub item: i64,
pub total_gigabytes: i64,
pub quantity: i64,
pub individual_size_gigabytes: i64,
pub parse_engine: i64,
pub needed_description_check: bool,
pub parse_engine: StorageParsingEngineVersion,
pub failed_reason: String,
}
impl DBTable for ParsedStorage {
const TABLE_NAME: &'static str = "Storage_Parsed";
@@ -224,13 +246,13 @@ impl DBTable for ParsedStorage {
quantity INTEGER,
sizes_gigabytes TEXT,
parse_engine INTEGER,
need_description_check INTEGER,
failed_reason TEXT,
UNIQUE(item, parse_engine),
FOREIGN KEY(item) REFERENCES Listings(item_id)
";
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?;
let iter = stmt.query_map([], |row| {
Ok(ParsedStorage {
id: row.get(0)?,
@@ -242,7 +264,7 @@ impl DBTable for ParsedStorage {
r.parse().unwrap_or(0)
},
parse_engine: row.get(5)?,
needed_description_check: row.get(6)?,
failed_reason: row.get(6)?,
})
})?;
@@ -273,7 +295,7 @@ impl ParsedStorage {
r.parse().unwrap()
},
parse_engine: row.get(5)?,
needed_description_check: row.get(6)?,
failed_reason: row.get(6)?,
})
})
.ok()
@@ -283,21 +305,26 @@ impl ParsedStorage {
}
pub fn add_or_update(&self, conn: &Connection) {
let _ = conn.execute(&format!("
let _ = conn
.execute(
&format!(
"
INSERT OR REPLACE INTO {}
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason)
VALUES
(?1, ?2, ?3, ?4, ?5, ?6)",
Self::TABLE_NAME),
Self::TABLE_NAME
),
(
&self.item,
self.total_gigabytes,
self.quantity,
self.individual_size_gigabytes.to_string(),
self.parse_engine,
self.needed_description_check
&self.failed_reason,
),
)
).unwrap();
.unwrap();
}
}
@@ -494,19 +521,40 @@ impl Listing {
.collect()
}
pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
let mut stmt = conn
.prepare(&format!(
pub fn lookup_pending_parse(
conn: &Connection,
allowed_engines: &[i64],
count_limit: u64,
) -> Vec<(i64, String)> {
let engines_filter = if !allowed_engines.is_empty() {
format!(
"AND ({})",
allowed_engines
.iter()
.map(|e| "ps.parse_engine = ".to_owned() + &e.to_string())
.collect::<Vec<_>>()
.join(" OR ")
)
} else {
String::new()
};
let query = format!(
"
SELECT ei.item_id, ei.title FROM {} AS ei
LEFT JOIN {} AS sp ON ei.item_id = sp.item
WHERE sp.item IS NULL",
SELECT listing.item_id, listing.title FROM {0} AS listing
WHERE NOT EXISTS (
SELECT 1 FROM {1} AS ps
WHERE listing.item_id = ps.item {engines_filter}
)
LIMIT {count_limit}
",
Self::TABLE_NAME,
ParsedStorage::TABLE_NAME
))
);
conn.prepare(&query)
.ok()
.unwrap();
stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
.unwrap()
.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
.ok()
.unwrap()
.map(|e| e.unwrap())
@@ -565,7 +613,7 @@ pub fn listings_get_filtered(
history: ItemAppearances::lookup(conn, l.item_id),
parsed: ParsedStorage::lookup(conn, l.item_id),
})
.filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
.filter(|lr| lr.parsed.iter().any(|p| p.failed_reason.is_empty()))
.collect::<Vec<ListingsFilterResult>>();
info!(
"Found total {} listings since (str:{} epoch:{})",
@@ -614,6 +662,125 @@ pub fn listings_get_filtered(
listings
}
#[repr(i64)]
#[derive(Serialize, Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
pub enum StorageLLMVersion {
Testing = 0,
Gemini2d5Prompt0 = 1,
}
impl ToSql for StorageLLMVersion {
fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
Ok((*self as i64).into())
}
}
impl FromSql for StorageLLMVersion {
fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
let v = value.as_i64()?;
Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
}
}
// This is mostly meant as a way to cache all of these.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedLLMStorageResult {
pub id: i64,
pub item_id: i64,
pub title: String,
pub quantity: i64,
pub gigabytes: i64,
pub fail_reason: String,
pub llm_id: StorageLLMVersion,
}
impl DBTable for ParsedLLMStorageResult {
const TABLE_NAME: &'static str = "ParsedLLMStorageResult";
const TABLE_SCHEMA: &'static str = "
id INTEGER PRIMARY KEY,
item_id INTEGER NOT NULL UNIQUE,
title TEXT NOT NULL,
quantity INTEGER NOT NULL,
gigabytes INTEGER NOT NULL,
fail_reason TEXT NOT NULL,
llm_id INTEGER NOT NULL
";
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!(
"SELECT id, item_id, title, quantity, gigabytes, fail_reason, llm_id FROM {}",
Self::TABLE_NAME
))?;
let iter = stmt.query_map([], |row| {
Ok(ParsedLLMStorageResult {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
quantity: row.get(3)?,
gigabytes: row.get(4)?,
fail_reason: row.get(5)?,
llm_id: row.get(6)?,
})
})?;
let mut result = Vec::new();
for item in iter {
result.push(item?);
}
Ok(result)
}
}
impl ParsedLLMStorageResult {
pub fn lookup(conn: &Connection, item_id: i64) -> Option<ParsedLLMStorageResult> {
let mut stmt = conn
.prepare(&format!(
"SELECT * FROM {} WHERE item_id = ?",
Self::TABLE_NAME
))
.ok()?;
stmt.query_one([item_id], |row| {
Ok(ParsedLLMStorageResult {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
quantity: row.get(3)?,
gigabytes: row.get(4)?,
fail_reason: row.get(5)?,
llm_id: row.get(6)?,
})
})
.ok()
}
pub fn add_or_update(&self, conn: &Connection) {
let count = conn
.execute(
&format!(
"INSERT OR REPLACE INTO {}
(
item_id,
title,
quantity,
gigabytes,
fail_reason,
llm_id
)
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
Self::TABLE_NAME
),
(
self.item_id,
&self.title,
self.quantity,
self.gigabytes,
self.fail_reason.clone(),
self.llm_id,
),
)
.unwrap();
if count != 1 {
panic!("Expected count to be 1 but got {}", count);
}
}
}
pub fn get_initialized(path: Option<&Path>) -> Connection {
let conn = match path {
Some(p) => Connection::open(&p),
@@ -626,6 +793,7 @@ pub fn get_initialized(path: Option<&Path>) -> Connection {
ParsedStorage::initialize(&conn);
ParsedPage::initialize(&conn);
ItemAppearances::initialize(&conn);
ParsedLLMStorageResult::initialize(&conn);
conn
}
@@ -637,6 +805,7 @@ pub struct Stats {
rows_parsed_storage: i64,
rows_parsed_page: i64,
rows_item_appearances: i64,
pub rows_parsed_storage_llm: i64,
}
pub fn get_stats(conn: &Connection) -> Stats {
@@ -646,6 +815,7 @@ pub fn get_stats(conn: &Connection) -> Stats {
rows_parsed_storage: ParsedStorage::get_count(conn),
rows_parsed_page: ParsedPage::get_count(conn),
rows_item_appearances: ItemAppearances::get_count(conn),
rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
}
}
@@ -667,7 +837,7 @@ mod tests {
let listing = Listing {
id: 1,
item_id: 1234,
title: "Some Title".to_string(),
title: "Lot of 2 512GB SSD 6gb/s working with 5% wear".to_string(),
buy_it_now_price_cents: Some(123),
has_best_offer: false,
image_url: "google.com".to_string(),
@@ -681,8 +851,8 @@ mod tests {
total_gigabytes: 13,
quantity: 3,
individual_size_gigabytes: 13,
parse_engine: 9,
needed_description_check: true,
parse_engine: StorageParsingEngineVersion::Testing,
failed_reason: "".to_owned(),
};
parsed.add_or_update(&db);
assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
@@ -706,6 +876,21 @@ mod tests {
vec![apperance]
);
let parsedllmstorage = ParsedLLMStorageResult {
fail_reason: "Some reason".to_owned(),
gigabytes: 12,
id: 1,
item_id: 12345,
quantity: 32,
title: "Some Title".to_owned(),
llm_id: StorageLLMVersion::Testing,
};
parsedllmstorage.add_or_update(&db);
assert_eq!(
ParsedLLMStorageResult::lookup(&db, parsedllmstorage.item_id),
Some(parsedllmstorage)
);
assert_eq!(Listing::lookup_since(&db, page.timestamp, 3), vec![listing]);
assert_eq!(
Listing::lookup_since(&db, page.timestamp + chrono::Duration::seconds(1), 3),

View File

@@ -1,4 +1,5 @@
pub mod db;
pub mod parser;
pub mod parser_ebay;
pub mod parser_storage;
pub mod parser_storage_e0;
pub mod parser_storage_e1;

View File

@@ -1,17 +1,16 @@
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use chrono::{DateTime, Utc};
use clap::Parser;
use ebay_scraper_rust::db::{
DBTable, ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
get_stats, listings_get_filtered,
};
use ebay_scraper_rust::db;
use ebay_scraper_rust::db::DBTable;
use ebay_scraper_rust::parser::parse_dir;
use ebay_scraper_rust::parser_storage;
use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
use futures::future::join_all;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Mutex;
use std::time::Instant;
use tracing::{info, instrument};
use tracing::{error, info, instrument};
use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
@@ -28,6 +27,13 @@ mod xdg_dirs;
)]
struct Args {}
struct AppCtx {
db: rusqlite::Connection,
db_llm: rusqlite::Connection,
download_dir: PathBuf,
llm_parser: Option<actix_web::rt::task::JoinHandle<()>>,
}
#[derive(Deserialize, Debug)]
struct ListingsFilter {
since: Option<i64>,
@@ -37,12 +43,12 @@ struct ListingsFilter {
#[get("/listings")]
async fn listings_filtered_get(
db: Data<Mutex<rusqlite::Connection>>,
ctx: Data<Mutex<AppCtx>>,
filter: web::Query<ListingsFilter>,
) -> Result<impl Responder> {
let start = Instant::now();
let res = listings_get_filtered(
&db.lock().unwrap(),
let res = db::listings_get_filtered(
&ctx.lock().unwrap().db,
&DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
filter.limit.unwrap_or(1_000),
filter.cents_per_tbytes_max.unwrap_or(100_00),
@@ -57,19 +63,16 @@ async fn listings_filtered_get(
}
#[get("/listing/{id}")]
async fn listing_get(
db: Data<Mutex<rusqlite::Connection>>,
id: web::Path<i64>,
) -> Result<impl Responder> {
Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
async fn listing_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
Ok(web::Json(db::Listing::lookup(&ctx.lock().unwrap().db, *id)))
}
#[get("/listing/{id}/parsed")]
async fn listing_parse_get(
db: Data<Mutex<rusqlite::Connection>>,
id: web::Path<i64>,
) -> Result<impl Responder> {
Ok(web::Json(ParsedStorage::lookup(&db.lock().unwrap(), *id)))
async fn listing_parse_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
Ok(web::Json(db::ParsedStorage::lookup(
&ctx.lock().unwrap().db,
*id,
)))
}
#[derive(Serialize)]
@@ -80,10 +83,10 @@ struct APIHistory {
#[get("/listing/{id}/history")]
async fn listing_history_get(
db: Data<Mutex<rusqlite::Connection>>,
ctx: Data<Mutex<AppCtx>>,
id: web::Path<i64>,
) -> Result<impl Responder> {
let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id)
let history: Vec<_> = db::ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
.iter()
// .inspect(|e| info!("got: {:?}", e))
.filter_map(|e| {
@@ -96,36 +99,109 @@ async fn listing_history_get(
Ok(web::Json(history))
}
#[post("/listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
let mut cnt = 0;
let db_unlocked = db.lock().unwrap();
Listing::lookup_non_parsed(&db_unlocked)
async fn storage_parse_work(entries: &[(i64, String)]) -> Vec<db::ParsedStorage> {
let llm_futures: Vec<_> = entries
.iter()
.map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
.inspect(|_| cnt = cnt + 1)
.for_each(|ps| ps.add_or_update(&db_unlocked));
.map(|(id, title)| parser_storage_e1::parse_size_and_quantity_llm(*id, title))
.collect();
let llm_future_results = join_all(llm_futures).await;
let llm_results = llm_future_results
.iter()
.flatten()
.map(|e| db::ParsedStorage {
id: 0,
item: e.item_id,
total_gigabytes: e.quantity * e.gigabytes,
quantity: e.quantity,
individual_size_gigabytes: e.gigabytes,
failed_reason: e.fail_reason.clone(),
parse_engine: db::StorageParsingEngineVersion::LLM,
});
// .inspect(|e| e.add_or_update(&unlocked.db))
// .map(|e| db::ParsedStorage {
// id: 0,
// item: e.item_id,
// total_gigabytes: e.quantity * e.gigabytes,
// quantity: e.quantity,
// individual_size_gigabytes: e.gigabytes,
// needed_description_check: !e.fail_reason.is_empty(),
// parse_engine: db::StorageParsingEngineVersion::LLM,
// })
// .for_each(|e| e.add_or_update(&unlocked.db));
Ok(web::Json(cnt))
// And a regex based parse.
let regex_results = entries
.iter()
.map(|(id, title)| parser_storage_e0::parse_size_and_quantity(*id, &title));
// .for_each(|e| e.add_or_update(&unlocked.db));
regex_results.chain(llm_results).collect()
}
fn storage_parse_worker(ctx: Data<Mutex<AppCtx>>) -> actix_web::rt::task::JoinHandle<()> {
actix_web::rt::spawn(async move {
loop {
actix_web::rt::time::sleep(std::time::Duration::from_millis(1000)).await;
let ctx_unlocked = ctx.lock().unwrap();
let entries = db::Listing::lookup_pending_parse(&ctx_unlocked.db, &[], 10);
let parsed = storage_parse_work(entries.as_slice()).await;
for p in parsed {
p.add_or_update(&ctx_unlocked.db);
}
}
})
}
#[post("/listing/parse")]
async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
// Prepare a background parser to go through and use an LLM to parse the
// storage info.
if ctx.lock().unwrap().llm_parser.is_none() {
ctx.clone().lock().unwrap().llm_parser = Some(storage_parse_worker(ctx.clone()));
}
// Lets grab a few entries and then try parsing them with two engines.
let ctx_locked = ctx.lock().unwrap();
let entries: Vec<_> = db::Listing::lookup_pending_parse(&ctx_locked.db, &[], 100)
.iter()
.take(10)
.map(|e| e.clone())
.collect();
for (item_id, title) in &entries {
let ps1 =
parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await;
if ps1.is_some() {
// info!(
// "Parsed using an LLM title:{} and results:{:?}",
// title,
// ps1.unwrap()
// );
ps1.unwrap().add_or_update(&ctx_locked.db);
// ps1.unwrap().add_or_update(&ctx_locked.db_llm); No need
} else {
error!("Failed to parse {item_id} with title {title}");
}
}
Ok(web::Json(entries.len()))
}
#[get("/category")]
async fn category_getnames(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
Ok(web::Json(SearchURL::names(&db.lock().unwrap())))
async fn category_getnames(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
Ok(web::Json(db::SearchURL::names(&ctx.lock().unwrap().db)))
}
#[post("/category/{category}/parse")]
#[instrument(skip_all)]
async fn category_parse(
db: Data<Mutex<rusqlite::Connection>>,
downloaddir: Data<PathBuf>,
ctx: Data<Mutex<AppCtx>>,
category: web::Path<String>,
) -> Result<impl Responder> {
let start = Instant::now();
let ctx_unlocked = ctx.lock().unwrap();
let count = parse_dir(
&downloaddir.join(category.clone()),
&ctx_unlocked.download_dir.join(category.clone()),
&category,
&db.lock().unwrap(),
&ctx_unlocked.db,
)
.unwrap();
let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
@@ -135,19 +211,22 @@ async fn category_parse(
}
#[get("/stats")]
async fn stats_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
Ok(web::Json(get_stats(&db.lock().unwrap())))
async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
let mut stats_db = db::get_stats(&ctx.lock().unwrap().db);
let stats_db_llm = db::get_stats(&ctx.lock().unwrap().db_llm);
stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
Ok(web::Json(stats_db))
}
#[get("/admin")]
async fn admin_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
let db = db.lock().unwrap();
async fn admin_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
let ctx_locked = ctx.lock().unwrap();
let query_start_time = Instant::now();
let search_urls = SearchURL::get_all(&db).unwrap_or_default();
let parsed_pages = ParsedPage::get_all(&db).unwrap_or_default();
let parsed_storages = ParsedStorage::get_all(&db).unwrap_or_default();
let item_appearances = ItemAppearances::get_all(&db).unwrap_or_default();
let listings = Listing::get_all(&db).unwrap_or_default();
let search_urls = db::SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
let parsed_pages = db::ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
let parsed_storages = db::ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
let item_appearances = db::ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
let listings = db::Listing::get_all(&ctx_locked.db).unwrap_or_default();
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
let html_gen_start_time = Instant::now();
@@ -287,10 +366,22 @@ async fn main() -> std::io::Result<()> {
"Starting with scraped data dir of \"{}\".",
scrapedatadir.to_str().unwrap()
);
let db_mutex = Data::new(Mutex::new(get_initialized(None)));
let app_data = Data::new(Mutex::new(AppCtx {
download_dir: scrapedatadir.clone(),
db: db::get_initialized(None),
db_llm: {
let db_path = scrapedatadir.with_file_name("llm.sqlite");
let db = rusqlite::Connection::open(&db_path).unwrap();
db::ParsedLLMStorageResult::initialize(&db);
info!("Created {:?} for caching LLM parsed title.", db_path);
db
},
llm_parser: None,
}));
// Prepare our backend by pulling in what categories we are preconfigured with.
SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");
db::SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
HttpServer::new(move || {
App::new()
@@ -306,8 +397,7 @@ async fn main() -> std::io::Result<()> {
.service(admin_get)
.service(stats_get)
// Stuff which is passed into every request.
.app_data(db_mutex.clone())
.app_data(Data::new(scrapedatadir.clone()))
.app_data(app_data.clone())
})
.bind(("0.0.0.0", 9876))?
.run()

View File

@@ -5,9 +5,10 @@ use crate::{
use rayon::prelude::*;
use serde::Deserialize;
use serde_json;
use std::path::Path;
use std::time::Instant;
use std::{io::Read, path::Path};
use tracing::{debug, error, info};
use zstd;
fn timestamps_from_dir(path: &Path) -> Vec<i64> {
if !std::fs::exists(path).expect("Directory must exist") {
@@ -20,17 +21,46 @@ fn timestamps_from_dir(path: &Path) -> Vec<i64> {
std::fs::read_dir(path)
.unwrap()
.map(|fpath| fpath.unwrap().path())
.filter_map(|fstem| {
fstem
.file_stem()
.and_then(|s| s.to_str())
.expect("Invalid file name")
.parse()
.ok()
.filter_map(|fname| {
// Turns out file_stem() doesn't handle multiple extensions and
// file_prefix() is still not stable.
Some(fname.file_stem()?.to_str()?.split_once('.')?.0.to_owned())
})
.filter_map(|fname| fname.parse().ok())
.collect()
}
fn read_timestamp_from_dir(
dir: &Path,
timestamp: &chrono::DateTime<chrono::Utc>,
) -> Option<String> {
// First check for the normal html version, which we can just read straight.
let page_path = dir.join(format!("{}.{}", timestamp.timestamp(), "html"));
if page_path.exists() {
return std::fs::read_to_string(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok();
}
// And now the case where it's compressed with zstd.
let page_path = dir.join(format!("{}.{}.{}", timestamp.timestamp(), "html", "zst"));
if page_path.exists() {
let f = std::fs::File::open(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok()?;
let mut s = String::new();
zstd::Decoder::new(f).ok()?.read_to_string(&mut s).ok()?;
return Some(s);
}
error!(
"Failed to lookup file for timestamp {} in {}, bailing ...",
timestamp,
dir.display()
);
None
}
pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
// Ensure the category is created.
let url_fpath = dir.join("url.json");
@@ -93,10 +123,7 @@ pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Optio
category: category.to_string(),
};
let page_path = dir.join(format!("{}.html", ts.timestamp()));
let page_contents = std::fs::read_to_string(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok()?;
let page_contents = read_timestamp_from_dir(dir, &ts)?;
let elements =
parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
info!(

View File

@@ -172,12 +172,22 @@ pub fn parse_from_ebay_page(
mod tests {
use super::*;
use similar_asserts::assert_eq;
use std::io::Read;
use zstd;
#[test_log::test]
fn parse() {
let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
let parsed = parse_from_ebay_page(html, &timestamp, "ssd").unwrap();
let zstd = include_bytes!("../test_data/scraper/raw_scraped/ssd/1750369463.html.zst");
let cursor = std::io::Cursor::new(zstd);
let mut html = String::new();
zstd::Decoder::new(cursor)
.unwrap()
.read_to_string(&mut html)
.unwrap();
let parsed = parse_from_ebay_page(&html, &timestamp, "ssd").unwrap();
// assert_eq!(parsed.len(), 62);
let parsed = parsed.first_chunk::<10>().unwrap();

View File

@@ -32,7 +32,7 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
let upper_title = title.to_uppercase();
let mut total_gb = 0i64;
let mut quantity = 1i64;
let mut needed_description_check = false;
let mut failed_reason = String::new();
let mut individual_size_gb = 0i64;
for pattern in EXPLICIT_QTY_PATTERNS.iter() {
@@ -68,36 +68,35 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
if !unique_sizes_gb.is_empty() {
individual_size_gb = unique_sizes_gb[0];
if unique_sizes_gb.len() > 1 {
needed_description_check = true;
failed_reason = "Mixed Sizes".to_owned();
}
}
}
if SIZE_RANGE_REGEX.is_match(&upper_title) {
needed_description_check = true;
failed_reason = "No Size Given".to_owned();
}
if quantity > 1 && upper_title.contains("MIXED") {
needed_description_check = true;
failed_reason = "Mixed Sizes".to_owned();
}
if upper_title.contains("CHECK THE DESCRIPTION")
|| upper_title.contains("CHECK DESCRIPTION")
|| upper_title.contains("SEE DESCRIPTION")
{
if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
needed_description_check = true;
failed_reason = "Mixed Sizes".to_owned();
}
}
if upper_title.contains("READ") {
failed_reason = "Mixed Sizes".to_owned();
}
if individual_size_gb > 0 {
total_gb = individual_size_gb * quantity;
}
if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
needed_description_check = true;
}
if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
// This condition is implicitly handled
failed_reason = "No size given".to_owned();
}
ParsedStorage {
@@ -106,8 +105,8 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
total_gigabytes: total_gb,
quantity,
individual_size_gigabytes: individual_size_gb,
needed_description_check,
parse_engine: 0,
failed_reason: failed_reason,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
}
}
@@ -125,8 +124,8 @@ mod tests {
total_gigabytes: 512 * 3,
quantity: 3,
individual_size_gigabytes: 512,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -138,8 +137,8 @@ mod tests {
total_gigabytes: 240,
quantity: 1,
individual_size_gigabytes: 240,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -151,8 +150,8 @@ mod tests {
total_gigabytes: 1024,
quantity: 1,
individual_size_gigabytes: 1024,
parse_engine: 0,
needed_description_check: true,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
false, // Sadly this one fails :/
),
@@ -164,8 +163,8 @@ mod tests {
total_gigabytes: 7 * 1024,
quantity: 1,
individual_size_gigabytes: 7 * 1024,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -177,8 +176,8 @@ mod tests {
total_gigabytes: 6 * 256,
quantity: 6,
individual_size_gigabytes: 256,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -190,8 +189,8 @@ mod tests {
total_gigabytes: 1966,
quantity: 1,
individual_size_gigabytes: 1966,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),

160
src/parser_storage_e1.rs Normal file
View File

@@ -0,0 +1,160 @@
use crate::db::{
ParsedLLMStorageResult, ParsedStorage, StorageLLMVersion, StorageParsingEngineVersion,
};
use reqwest::header::{AUTHORIZATION, CONTENT_TYPE};
use serde::{Deserialize, Serialize};
use serde_json::json;
use tracing::error;
// Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD
// (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching and
// Gemini 2.5 Flash Lite at $0.10/M input and $0.40/M output, this would cost $0.0000338 Input,
// $0.0000144 Output, and $0.0000482 Total. Given 30,000 listings this would be $1.446.
const SYSTEM_PROMPT: &str = r#"
You will be given a product listing for one or more storage drives. You will return *ONLY* JSON strictly adhering to the same structure and key names as below. This means no backticks or markdown/markup. You will specify how many storage drives are included in the listing as a number (1, 2, 3, etc), the size in gigabytes of each drive as a number (rounding up if needed, so 1, 2, 3, etc), and lastly if the above cannot be provided due to the listing title being incomplete or confusing, a very short reason why.
Here is an example for a title of "Lot of 2, Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
{
"quantity": 2,
"gigabytes": 1024
"fail_reason": ""
}
And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
{
"quantity": 0,
"gigabytes": 0,
"fail_reason": "multiple mixed sizes"
}
"#;
#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)]
struct LLMParsedResponse {
pub quantity: i64,
pub gigabytes: i64,
pub fail_reason: String,
}
#[derive(Deserialize, Debug)]
struct OpenAIResponse {
choices: Vec<OpenAIChoice>,
}
#[derive(Deserialize, Debug)]
struct OpenAIChoice {
message: OpenAIMessage,
}
#[derive(Deserialize, Debug)]
struct OpenAIMessage {
content: String,
}
#[cfg(test)]
const OPENAI_LLM_URL: &str = "https://badurl.hak8or.com/litellm_api/chat/completions";
#[cfg(not(test))]
const OPENAI_LLM_URL: &str = "https://ai.hak8or.com/litellm_api/chat/completions";
#[cfg(test)]
const OPENAI_LLM_API_KEY: &str = "Bearer sk-YmVlcC1ib29wLWEtcm9ib3Q";
#[cfg(not(test))]
const OPENAI_LLM_API_KEY: &str = "Bearer sk-HMGML94x2ag6ggOoDghSGA";
pub async fn parse_size_and_quantity_llm(
item_id: i64,
title: &str,
) -> Option<ParsedLLMStorageResult> {
let client = reqwest::Client::new();
let req = client
.post(OPENAI_LLM_URL)
.header(CONTENT_TYPE, actix_web::mime::APPLICATION_JSON.to_string())
.header(AUTHORIZATION, OPENAI_LLM_API_KEY)
.body(
json!({
"model": "gemini-2.5-flash-lite",
"reasoning_effort": "disable",
"thinking": {"type": "disabled", "budget_tokens": 0},
"messages": [
{ "role": "system", "content": SYSTEM_PROMPT },
{ "role": "user", "content": title }
]
})
.to_string(),
);
let reply_body = req.send().await.ok()?.text().await.ok()?;
let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?;
match repl_json.choices.len() {
0 => {
error!("When parsing title, LLM returned ZERO choices");
return None;
}
1 => { /* Nothing to do */ }
a => error!("When parsing title, LLM returned {a}, >1 choices, using first!"),
}
let reply_parsed_storage_json: LLMParsedResponse =
serde_json::from_str(&repl_json.choices[0].message.content).ok()?;
if !reply_parsed_storage_json.fail_reason.is_empty() {
error!(
"Failed parsing item_id:{item_id}, title:{title}, due to reason:{}",
reply_parsed_storage_json.fail_reason
);
}
Some(ParsedLLMStorageResult {
id: 0,
fail_reason: reply_parsed_storage_json.fail_reason.clone(),
gigabytes: reply_parsed_storage_json.gigabytes,
item_id,
quantity: reply_parsed_storage_json.quantity,
title: title.to_owned(),
llm_id: StorageLLMVersion::Gemini2d5Prompt0,
})
}
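// For reference, a reply of the following shape (illustrative only, not captured
// traffic) is what the double decode above expects: the OpenAI-compatible
// envelope, whose choices[0].message.content is itself a JSON string in the
// LLMParsedResponse shape:
//   {"choices":[{"message":{"content":"{\"quantity\":2,\"gigabytes\":1024,\"fail_reason\":\"\"}"}}]}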
// Since we can't have a hashmap in a const, and I don't want to play with
// making our parsed result struct contain a CoW string for fail_reason and
// title, we are stuck with this ...
pub fn parse_cached(item_id: i64, title: &str) -> Option<ParsedLLMStorageResult> {
match title {
"Lot of 2 512GB SSD 6gb/s working with 5% wear" => Some(ParsedLLMStorageResult {
id: 0,
item_id,
fail_reason: "".to_string(),
gigabytes: 512,
quantity: 2,
title: title.to_owned(),
llm_id: StorageLLMVersion::Testing,
}),
"Lot of 2 assorted SSD" => Some(ParsedLLMStorageResult {
id: 0,
fail_reason: "mixed sizes".to_owned(),
gigabytes: 0,
item_id,
quantity: 0,
title: title.to_owned(),
llm_id: StorageLLMVersion::Testing,
}),
_ => None,
}
}
/// Parses size and quantity information from an item title.
pub async fn parse_size_and_quantity(
db: &rusqlite::Connection,
item_id: i64,
title: &str,
) -> Option<ParsedStorage> {
let plsr = parse_size_and_quantity_llm(item_id, title).await?;
plsr.add_or_update(db);
Some(ParsedStorage {
id: 0,
item: item_id,
total_gigabytes: plsr.quantity * plsr.gigabytes,
quantity: plsr.quantity,
individual_size_gigabytes: plsr.gigabytes,
failed_reason: plsr.fail_reason,
parse_engine: StorageParsingEngineVersion::LLM,
})
}

View File

@@ -5,6 +5,10 @@ URL_PER_PAGE_240="&_ipg=240"
URL_MIN_PRICE_USD_60="&_udlo=60.00"
URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_SORTBY_NEWLY_LISTED="&_sop=10"
URL_SORTBY_ENDING_SOONEST="&_sop=1"
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_CATEGORY_SSD="&_sacat=175669"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"
if [ -z "${XDG_DATA_HOME}" ]; then
@@ -12,20 +16,141 @@ if [ -z "${XDG_DATA_HOME}" ]; then
exit
fi
DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
if [ ! -s "$DIR_SSDS/url.json" ]; then
URL_CATEGORY_SSD="&_sacat=175669"
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
wget -o "$DIR_SSDS/$(date +%s).html" "$(jq '.url' $DIR_SSDS/url.json)"
# Heh, so Ebay started to block my scraping efforts after a while. I couldn't
# get this working with wget, so I went with this route instead, which is quite
# ugly but should work. We effectively run a non-headless browser with various
# realistic headers and screen dimensions. Lastly, we give the page an extra
# 5 seconds to run any javascript that is often used to counter scraping or
# bots.
fetch_compress_save_html() {
local url="$1"
local output_file="$2"
DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
wget -o "$DIR_MINIPC/$(date +%s).html" "$(jq '.url' $DIR_MINIPC/url.json)"
echo "Fetching $url"
xvfb-run --server-args="-screen 0 1024x768x24" \
uv run --with playwright --with playwright-stealth - "$url" <<'EOF' | zstd -z --ultra -19 -o "$output_file"
import asyncio
import sys
from playwright.async_api import async_playwright
from playwright_stealth import Stealth
async def main():
async with Stealth().use_async(async_playwright()) as p:
browser = await p.chromium.launch(
executable_path='/usr/bin/chromium',
args=[
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-blink-features=AutomationControlled',
"--window-size=1901,1018"
],
headless=False
)
# Create context with user agent
context = await browser.new_context(
color_scheme=r"light",
locale=r"en-US,en;q=0.9",
user_agent=r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
timezone_id=r"America/New_York",
extra_http_headers={
"origin": "https://www.ebay.com",
"accept": "*/*",
"accept-encoding": "gzip, deflate, br, zstd",
"cache-control": "no-cache",
"accept-language": "en-US,en;q=0.9"
}
)
page = await context.new_page()
await page.goto(sys.argv[1], wait_until="domcontentloaded")
await page.wait_for_timeout(5000)
print(await page.content())
await browser.close()
asyncio.run(main())
EOF
}
fetch() {
local name="$1"
local url_param="$2"
DIR="$XDG_DATA_HOME/scraper/raw_scraped/$name"
mkdir -p "$DIR"
if [ ! -s "$DIR/url.json" ]; then
local URL="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$url_param&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL\"}" > "$DIR/url.json"
fi
URL_NEWEST="$(jq '.url' $DIR/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED"
fetch_compress_save_html $URL_NEWEST "$DIR/$(date +%s).html.zst"
URL_ENDING="$(jq '.url' $DIR/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST"
fetch_compress_save_html $URL_ENDING "$DIR/$(date +%s).html.zst"
}
fetch "ssd" "$URL_CATEGORY_SSD"
fetch "minipc" "$URL_CATEGORY_MINIPC_ALLINONE"
# If needing to do a mass compression;
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}
# If needing to purge bogus downloads
# fd --size -100K .html.zst -x ls -lah {}
# fd --size -100K .html.zst -x rm {}
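# (A real results page lands around 210-240 KiB compressed per the analysis
#  below, so anything under ~100 KiB is presumably a block/captcha page or a
#  truncated download rather than actual listings.)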
# Level compression analysis;
#
# A single scraped result;
# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
#
# Lets see if we get benefits tar'ing and them compressing;
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012328.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012331.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015932.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015929.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019567.html
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019564.html
# -rw-r--r-- 1 hak8or users 16M Sep 1 12:23 175501.tar
# ➜ for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
# 175501.tar : 5.36% ( 15.6 MiB => 854 KiB, 175501.tar.9)
# 175501.tar : 5.32% ( 15.6 MiB => 847 KiB, 175501.tar.10)
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.11)
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.12)
# 175501.tar : 5.48% ( 15.6 MiB => 872 KiB, 175501.tar.13)
# 175501.tar : 5.42% ( 15.6 MiB => 864 KiB, 175501.tar.14)
# 175501.tar : 5.19% ( 15.6 MiB => 828 KiB, 175501.tar.15)
# 175501.tar : 5.31% ( 15.6 MiB => 845 KiB, 175501.tar.16)
# 175501.tar : 5.01% ( 15.6 MiB => 798 KiB, 175501.tar.17)
# 175501.tar : 5.04% ( 15.6 MiB => 803 KiB, 175501.tar.18)
# 175501.tar : 4.84% ( 15.6 MiB => 771 KiB, 175501.tar.19)
# 175501.tar : 4.79% ( 15.6 MiB => 764 KiB, 175501.tar.20)
# 175501.tar : 4.74% ( 15.6 MiB => 755 KiB, 175501.tar.21)
# 175501.tar : 4.73% ( 15.6 MiB => 753 KiB, 175501.tar.22)
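# The big drop at level 9+ for the tar (8.25% -> 5.36%) is presumably the match
# window growing past the ~2.6 MiB size of a single page, letting zstd reuse
# matches across the near-identical HTML files; single files barely improve
# past level 3.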

File diff suppressed because one or more lines are too long

Binary file not shown.