Compare commits
4 Commits
parallel_s
...
parser_llm
Author | SHA1 | Date | |
---|---|---|---|
cb8025becd | |||
0039078f41 | |||
4ae1622f02 | |||
b538dd8012 |
739
Cargo.lock
generated
739
Cargo.lock
generated
@@ -39,8 +39,8 @@ dependencies = [
|
||||
"flate2",
|
||||
"foldhash",
|
||||
"futures-core",
|
||||
"h2",
|
||||
"http",
|
||||
"h2 0.3.26",
|
||||
"http 0.2.12",
|
||||
"httparse",
|
||||
"httpdate",
|
||||
"itoa",
|
||||
@@ -76,7 +76,7 @@ checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8"
|
||||
dependencies = [
|
||||
"bytestring",
|
||||
"cfg-if",
|
||||
"http",
|
||||
"http 0.2.12",
|
||||
"regex",
|
||||
"regex-lite",
|
||||
"serde",
|
||||
@@ -289,6 +289,12 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "atomic-waker"
|
||||
version = "1.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0"
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.5.0"
|
||||
@@ -491,6 +497,16 @@ dependencies = [
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.7"
|
||||
@@ -679,19 +695,21 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"dirs",
|
||||
"futures",
|
||||
"lazy_static",
|
||||
"liblzma",
|
||||
"num_enum",
|
||||
"rayon",
|
||||
"regex",
|
||||
"reqwest",
|
||||
"rusqlite",
|
||||
"scraper",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"similar-asserts",
|
||||
"strum",
|
||||
"test-log",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -748,6 +766,16 @@ version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "errno"
|
||||
version = "0.3.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fallible-iterator"
|
||||
version = "0.3.0"
|
||||
@@ -760,6 +788,12 @@ version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a"
|
||||
|
||||
[[package]]
|
||||
name = "fastrand"
|
||||
version = "2.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.1.2"
|
||||
@@ -782,6 +816,21 @@ version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2"
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types"
|
||||
version = "0.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1"
|
||||
dependencies = [
|
||||
"foreign-types-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "foreign-types-shared"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.1"
|
||||
@@ -801,12 +850,65 @@ dependencies = [
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-core"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.31"
|
||||
@@ -825,10 +927,16 @@ version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"slab",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -899,7 +1007,26 @@ dependencies = [
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"futures-util",
|
||||
"http",
|
||||
"http 0.2.12",
|
||||
"indexmap",
|
||||
"slab",
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"tracing",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "h2"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
"fnv",
|
||||
"futures-core",
|
||||
"futures-sink",
|
||||
"http 1.3.1",
|
||||
"indexmap",
|
||||
"slab",
|
||||
"tokio",
|
||||
@@ -954,6 +1081,40 @@ dependencies = [
|
||||
"itoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http"
|
||||
version = "1.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"fnv",
|
||||
"itoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http 1.3.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "http-body-util"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"futures-core",
|
||||
"http 1.3.1",
|
||||
"http-body",
|
||||
"pin-project-lite",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "httparse"
|
||||
version = "1.10.1"
|
||||
@@ -966,6 +1127,86 @@ version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9"
|
||||
|
||||
[[package]]
|
||||
name = "hyper"
|
||||
version = "1.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e"
|
||||
dependencies = [
|
||||
"atomic-waker",
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"h2 0.4.12",
|
||||
"http 1.3.1",
|
||||
"http-body",
|
||||
"httparse",
|
||||
"itoa",
|
||||
"pin-project-lite",
|
||||
"pin-utils",
|
||||
"smallvec",
|
||||
"tokio",
|
||||
"want",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-rustls"
|
||||
version = "0.27.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58"
|
||||
dependencies = [
|
||||
"http 1.3.1",
|
||||
"hyper",
|
||||
"hyper-util",
|
||||
"rustls",
|
||||
"rustls-pki-types",
|
||||
"tokio",
|
||||
"tokio-rustls",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-tls"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-util",
|
||||
"native-tls",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hyper-util"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"http 1.3.1",
|
||||
"http-body",
|
||||
"hyper",
|
||||
"ipnet",
|
||||
"libc",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"socket2",
|
||||
"system-configuration",
|
||||
"tokio",
|
||||
"tower-service",
|
||||
"tracing",
|
||||
"windows-registry",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.63"
|
||||
@@ -1113,6 +1354,22 @@ dependencies = [
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ipnet"
|
||||
version = "2.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130"
|
||||
|
||||
[[package]]
|
||||
name = "iri-string"
|
||||
version = "0.7.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "is_terminal_polyfill"
|
||||
version = "1.70.1"
|
||||
@@ -1163,26 +1420,6 @@ version = "0.2.174"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
|
||||
|
||||
[[package]]
|
||||
name = "liblzma"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0791ab7e08ccc8e0ce893f6906eb2703ed8739d8e89b57c0714e71bad09024c8"
|
||||
dependencies = [
|
||||
"liblzma-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "liblzma-sys"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01b9596486f6d60c3bbe644c0e1be1aa6ccc472ad630fe8927b456973d7cb736"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libredox"
|
||||
version = "0.1.3"
|
||||
@@ -1204,6 +1441,12 @@ dependencies = [
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "linux-raw-sys"
|
||||
version = "0.9.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12"
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.8.0"
|
||||
@@ -1316,6 +1559,23 @@ dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "native-tls"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"log",
|
||||
"openssl",
|
||||
"openssl-probe",
|
||||
"openssl-sys",
|
||||
"schannel",
|
||||
"security-framework",
|
||||
"security-framework-sys",
|
||||
"tempfile",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.6"
|
||||
@@ -1347,6 +1607,28 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a"
|
||||
dependencies = [
|
||||
"num_enum_derive",
|
||||
"rustversion",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum_derive"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d"
|
||||
dependencies = [
|
||||
"proc-macro-crate",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.7"
|
||||
@@ -1368,6 +1650,50 @@ version = "1.70.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad"
|
||||
|
||||
[[package]]
|
||||
name = "openssl"
|
||||
version = "0.10.73"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"cfg-if",
|
||||
"foreign-types",
|
||||
"libc",
|
||||
"once_cell",
|
||||
"openssl-macros",
|
||||
"openssl-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openssl-probe"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e"
|
||||
|
||||
[[package]]
|
||||
name = "openssl-sys"
|
||||
version = "0.9.109"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"libc",
|
||||
"pkg-config",
|
||||
"vcpkg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "option-ext"
|
||||
version = "0.2.0"
|
||||
@@ -1509,6 +1835,15 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-crate"
|
||||
version = "3.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
|
||||
dependencies = [
|
||||
"toml_edit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.95"
|
||||
@@ -1667,6 +2002,62 @@ version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
||||
|
||||
[[package]]
|
||||
name = "reqwest"
|
||||
version = "0.12.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb"
|
||||
dependencies = [
|
||||
"base64",
|
||||
"bytes",
|
||||
"encoding_rs",
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"h2 0.4.12",
|
||||
"http 1.3.1",
|
||||
"http-body",
|
||||
"http-body-util",
|
||||
"hyper",
|
||||
"hyper-rustls",
|
||||
"hyper-tls",
|
||||
"hyper-util",
|
||||
"js-sys",
|
||||
"log",
|
||||
"mime",
|
||||
"native-tls",
|
||||
"percent-encoding",
|
||||
"pin-project-lite",
|
||||
"rustls-pki-types",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"serde_urlencoded",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tokio-native-tls",
|
||||
"tower",
|
||||
"tower-http",
|
||||
"tower-service",
|
||||
"url",
|
||||
"wasm-bindgen",
|
||||
"wasm-bindgen-futures",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"getrandom 0.2.16",
|
||||
"libc",
|
||||
"untrusted",
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rusqlite"
|
||||
version = "0.36.0"
|
||||
@@ -1688,6 +2079,52 @@ version = "0.1.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f"
|
||||
|
||||
[[package]]
|
||||
name = "rustix"
|
||||
version = "1.0.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"errno",
|
||||
"libc",
|
||||
"linux-raw-sys",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls"
|
||||
version = "0.23.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"rustls-pki-types",
|
||||
"rustls-webpki",
|
||||
"subtle",
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-pki-types"
|
||||
version = "1.12.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79"
|
||||
dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustls-webpki"
|
||||
version = "0.103.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc"
|
||||
dependencies = [
|
||||
"ring",
|
||||
"rustls-pki-types",
|
||||
"untrusted",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.21"
|
||||
@@ -1700,6 +2137,15 @@ version = "1.0.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
|
||||
|
||||
[[package]]
|
||||
name = "schannel"
|
||||
version = "0.1.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d"
|
||||
dependencies = [
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
@@ -1721,6 +2167,29 @@ dependencies = [
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework"
|
||||
version = "2.11.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"core-foundation",
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
"security-framework-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "security-framework-sys"
|
||||
version = "2.14.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.26.0"
|
||||
@@ -1914,27 +2383,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
||||
|
||||
[[package]]
|
||||
name = "strum"
|
||||
version = "0.27.1"
|
||||
name = "subtle"
|
||||
version = "2.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f64def088c51c9510a8579e3c5d67c65349dcf755e5479ad3d010aa6454e2c32"
|
||||
dependencies = [
|
||||
"phf",
|
||||
"strum_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "strum_macros"
|
||||
version = "0.27.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c77a8c5abcaf0f9ce05d62342b7d298c346515365c36b673df4ebe3ced01fde8"
|
||||
dependencies = [
|
||||
"heck",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustversion",
|
||||
"syn",
|
||||
]
|
||||
checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292"
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
@@ -1947,6 +2399,15 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sync_wrapper"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.13.2"
|
||||
@@ -1958,6 +2419,40 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"core-foundation",
|
||||
"system-configuration-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "system-configuration-sys"
|
||||
version = "0.6.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4"
|
||||
dependencies = [
|
||||
"core-foundation-sys",
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tempfile"
|
||||
version = "3.21.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e"
|
||||
dependencies = [
|
||||
"fastrand",
|
||||
"getrandom 0.3.3",
|
||||
"once_cell",
|
||||
"rustix",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
@@ -2078,6 +2573,26 @@ dependencies = [
|
||||
"windows-sys 0.52.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-native-tls"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2"
|
||||
dependencies = [
|
||||
"native-tls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-rustls"
|
||||
version = "0.26.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b"
|
||||
dependencies = [
|
||||
"rustls",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tokio-util"
|
||||
version = "0.7.15"
|
||||
@@ -2091,6 +2606,68 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"toml_datetime",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-util",
|
||||
"pin-project-lite",
|
||||
"sync_wrapper",
|
||||
"tokio",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-http"
|
||||
version = "0.6.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"bytes",
|
||||
"futures-util",
|
||||
"http 1.3.1",
|
||||
"http-body",
|
||||
"iri-string",
|
||||
"pin-project-lite",
|
||||
"tower",
|
||||
"tower-layer",
|
||||
"tower-service",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower-layer"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e"
|
||||
|
||||
[[package]]
|
||||
name = "tower-service"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3"
|
||||
|
||||
[[package]]
|
||||
name = "tracing"
|
||||
version = "0.1.41"
|
||||
@@ -2153,6 +2730,12 @@ dependencies = [
|
||||
"tracing-log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.18.0"
|
||||
@@ -2183,6 +2766,12 @@ version = "0.2.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853"
|
||||
|
||||
[[package]]
|
||||
name = "untrusted"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.4"
|
||||
@@ -2230,6 +2819,15 @@ version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "want"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e"
|
||||
dependencies = [
|
||||
"try-lock",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.1+wasi-snapshot-preview1"
|
||||
@@ -2271,6 +2869,19 @@ dependencies = [
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-futures"
|
||||
version = "0.4.50"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"js-sys",
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
"web-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.100"
|
||||
@@ -2303,6 +2914,16 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "web-sys"
|
||||
version = "0.3.77"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
|
||||
dependencies = [
|
||||
"js-sys",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "winapi"
|
||||
version = "0.3.9"
|
||||
@@ -2366,6 +2987,17 @@ version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a"
|
||||
|
||||
[[package]]
|
||||
name = "windows-registry"
|
||||
version = "0.5.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
"windows-result",
|
||||
"windows-strings",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-result"
|
||||
version = "0.3.4"
|
||||
@@ -2466,6 +3098,15 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.7.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
@@ -2546,6 +3187,12 @@ dependencies = [
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zeroize"
|
||||
version = "1.8.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde"
|
||||
|
||||
[[package]]
|
||||
name = "zerotrie"
|
||||
version = "0.2.2"
|
||||
|
@@ -8,18 +8,20 @@ actix-web = "4.11.0"
|
||||
chrono = { version = "0.4.41", features = ["serde"] }
|
||||
clap = { version = "4.5.40", features = ["derive"] }
|
||||
dirs = "6.0.0"
|
||||
futures = "0.3.31"
|
||||
lazy_static = "1.5.0"
|
||||
liblzma = "0.4.2"
|
||||
num_enum = "0.7.4"
|
||||
rayon = "1.10.0"
|
||||
regex = "1.11.1"
|
||||
reqwest = { version = "0.12.23", features = ["blocking"] }
|
||||
rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
|
||||
scraper = "0.23.1"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
serde_json = "1.0.140"
|
||||
strum = { version = "0.27.1", features = ["std", "derive", "phf", "strum_macros"] }
|
||||
test-log = { version = "0.2.17", features = ["trace"] }
|
||||
tracing = { version = "0.1.41", features = ["attributes"] }
|
||||
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
|
||||
zstd = "0.13.3"
|
||||
|
||||
[dev-dependencies]
|
||||
similar-asserts = "1.7.0"
|
||||
|
440
readme.md
440
readme.md
@@ -3,352 +3,120 @@
|
||||
This is a dumb little tool which ingests raw HTML files, does some parsing on them, and serves the results over a web API.
|
||||
|
||||
```bash
|
||||
export URL_BASE="localhost:9876"; \
|
||||
echo run0 && http POST "$URL_BASE/category/ssd/discover" && \
|
||||
echo run1 && http POST "$URL_BASE/category/ssd/parse" && \
|
||||
echo run2 && http GET "$URL_BASE/category/ssd/parse" && \
|
||||
echo run3 && http POST "$URL_BASE/listing/parse" && \
|
||||
echo run4 && http GET "$URL_BASE/listings" since:=10099 limit:=10 cents_per_tbytes_max:=900 && \
|
||||
echo run5 && http GET "$URL_BASE/listing/267267322597" && \
|
||||
echo run6 && http GET "$URL_BASE/listing/267267322597/history" &&
|
||||
echo run7 && http GET "$URL_BASE/listing/267267322597/parsed"
|
||||
export URL_BASE="http://scraper.homelab.hak8or.com:8080"; \
|
||||
echo run0 && http POST "$URL_BASE/page/parse/ssd" && \
|
||||
echo run1 && http POST "$URL_BASE/listing/parse" && \
|
||||
echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
|
||||
echo run3 && http GET "$URL_BASE/listing/388484391867" && \
|
||||
echo run4 && http GET "$URL_BASE/listing/286605201240/history"
|
||||
```
|
||||
|
||||
```
|
||||
run0
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 0
|
||||
content-type: text/plain; charset=utf-8
|
||||
date: Thu, 10 Jul 2025 04:26:49 GMT
|
||||
|
||||
|
||||
|
||||
|
||||
run1
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 0
|
||||
content-type: text/plain; charset=utf-8
|
||||
date: Thu, 10 Jul 2025 04:26:49 GMT
|
||||
|
||||
|
||||
|
||||
|
||||
run2
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 36
|
||||
content-type: application/json
|
||||
date: Thu, 10 Jul 2025 04:26:49 GMT
|
||||
|
||||
[
|
||||
[
|
||||
"PendingParse",
|
||||
1,
|
||||
1
|
||||
],
|
||||
[
|
||||
"Ready",
|
||||
0,
|
||||
1
|
||||
]
|
||||
]
|
||||
|
||||
|
||||
run3
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 2
|
||||
content-type: application/json
|
||||
date: Thu, 10 Jul 2025 04:26:49 GMT
|
||||
|
||||
62
|
||||
|
||||
|
||||
run4
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 4232
|
||||
content-type: application/json
|
||||
date: Thu, 10 Jul 2025 04:26:49 GMT
|
||||
|
||||
[
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 1260,
|
||||
"item": 286605201240,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": true,
|
||||
"id": 5,
|
||||
"image_url": "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp",
|
||||
"item_id": 286605201240,
|
||||
"title": "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 5,
|
||||
"individual_size_gigabytes": 1024,
|
||||
"item": 286605201240,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 1,
|
||||
"total_gigabytes": 1024
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 2400,
|
||||
"item": 177133381123,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 22,
|
||||
"image_url": "https://i.ebayimg.com/images/g/-VMAAOSwaX1oNyx4/s-l500.webp",
|
||||
"item_id": 177133381123,
|
||||
"title": "SanDisk professional G-DRIVE SSD 2TB, A+ condition"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 22,
|
||||
"individual_size_gigabytes": 2048,
|
||||
"item": 177133381123,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 1,
|
||||
"total_gigabytes": 2048
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 3108,
|
||||
"item": 187263467837,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 35,
|
||||
"image_url": "https://i.ebayimg.com/images/g/hn8AAOSw1hJoNrJm/s-l500.webp",
|
||||
"item_id": 187263467837,
|
||||
"title": "Used Fanxiang S880 4TB SSD NVME M.2 SSD PCIe 4x4 7300MBS Solid State Drive"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 35,
|
||||
"individual_size_gigabytes": 4096,
|
||||
"item": 187263467837,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 1,
|
||||
"total_gigabytes": 4096
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 1000,
|
||||
"item": 267267367821,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 37,
|
||||
"image_url": "https://i.ebayimg.com/images/g/Cr8AAOSwXY1oN6m8/s-l500.webp",
|
||||
"item_id": 267267367821,
|
||||
"title": "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 37,
|
||||
"individual_size_gigabytes": 256,
|
||||
"item": 267267367821,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 6,
|
||||
"total_gigabytes": 1536
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 4600,
|
||||
"item": 187263491149,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 44,
|
||||
"image_url": "https://i.ebayimg.com/images/g/v2EAAOSwg9poNrTr/s-l500.webp",
|
||||
"item_id": 187263491149,
|
||||
"title": "Used Silicon Power 4TB US75 Nvme PCIe Gen4x4 M.2 2280 SSD R/W Up to 7000/6500 MB"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 44,
|
||||
"individual_size_gigabytes": 4096,
|
||||
"item": 187263491149,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 1,
|
||||
"total_gigabytes": 4096
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 1000,
|
||||
"item": 267267351339,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 46,
|
||||
"image_url": "https://i.ebayimg.com/images/g/z8EAAOSwyKZoN6TW/s-l500.webp",
|
||||
"item_id": 267267351339,
|
||||
"title": "(Lot of 6) Used -Micron MTFDDAV256TBN 256GB, M.2 2280 Solid State Drive"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 46,
|
||||
"individual_size_gigabytes": 256,
|
||||
"item": 267267351339,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 6,
|
||||
"total_gigabytes": 1536
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 99,
|
||||
"item": 306325087069,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 59,
|
||||
"image_url": "https://i.ebayimg.com/images/g/zuUAAOSwIoJoN5yC/s-l500.webp",
|
||||
"item_id": 306325087069,
|
||||
"title": "T298 ~ HP OEM Desktop Z240 Workstation Heatsink w NVMe M.2 256GB SSD 826414-001"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 59,
|
||||
"individual_size_gigabytes": 256,
|
||||
"item": 306325087069,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 1,
|
||||
"total_gigabytes": 256
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"history": [
|
||||
{
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 1000,
|
||||
"item": 267267322597,
|
||||
"timestamp": "2025-06-19T21:44:23Z"
|
||||
}
|
||||
],
|
||||
"listing": {
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 60,
|
||||
"image_url": "https://i.ebayimg.com/images/g/r8YAAOSwlkdoN5uW/s-l500.webp",
|
||||
"item_id": 267267322597,
|
||||
"title": "(Lot of 5) Used - Micro 1100 256GB SATA III 2.5\" SSD MTFDDAK256TBN"
|
||||
},
|
||||
"parsed": [
|
||||
{
|
||||
"id": 60,
|
||||
"individual_size_gigabytes": 256,
|
||||
"item": 267267322597,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 5,
|
||||
"total_gigabytes": 1280
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
run5
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 237
|
||||
content-type: application/json
|
||||
date: Thu, 10 Jul 2025 04:26:49 GMT
|
||||
And some jq usage for raw interaction of the data;
|
||||
```bash
|
||||
# Download a bunch of listings.
|
||||
http https://scraper.hak8or.com/api/listings since==0 limit==20 > listings.json
|
||||
|
||||
# Show what a single listing looks like.
|
||||
listings.json | jq '.[0]'
|
||||
{
|
||||
"listing": {
|
||||
"id": 22563,
|
||||
"item_id": 286707621236,
|
||||
"title": "WD_BLACK SN770M 2TB M.2 NVMe Internal SSD (WDBDNH0020BBK-WRSN)",
|
||||
"buy_it_now_price_cents": null,
|
||||
"has_best_offer": false,
|
||||
"id": 60,
|
||||
"image_url": "https://i.ebayimg.com/images/g/r8YAAOSwlkdoN5uW/s-l500.webp",
|
||||
"item_id": 267267322597,
|
||||
"title": "(Lot of 5) Used - Micro 1100 256GB SATA III 2.5\" SSD MTFDDAK256TBN"
|
||||
"image_url": "https://i.ebayimg.com/images/g/It4AAeSwzz5oddoa/s-l140.jpg"
|
||||
},
|
||||
"history": [
|
||||
{
|
||||
"item": 286707621236,
|
||||
"timestamp": "2025-07-15T04:46:54Z",
|
||||
"category": "ssd",
|
||||
"current_bid_usd_cents": 12900
|
||||
}
|
||||
],
|
||||
"parsed": [
|
||||
{
|
||||
"id": 6,
|
||||
"item": 286707621236,
|
||||
"total_gigabytes": 2048,
|
||||
"quantity": 1,
|
||||
"individual_size_gigabytes": 2048,
|
||||
"parse_engine": 0,
|
||||
"needed_description_check": false
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
|
||||
run6
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 62
|
||||
content-type: application/json
|
||||
date: Thu, 10 Jul 2025 04:26:50 GMT
|
||||
|
||||
[
|
||||
{
|
||||
"current_bid_usd_cents": 1000,
|
||||
"when": "2025-06-19T21:44:23Z"
|
||||
# Show the 1st and 2nd items, but only grab a few specific entries.
|
||||
cat listings_small.json | jq '[.[1:3][] | {
|
||||
item_id: .listing.item_id,
|
||||
title: .listing.title,
|
||||
parsed: .parsed[] | {
|
||||
total_gigabytes,
|
||||
quantity,
|
||||
individual_size_gigabytes
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
run7
|
||||
HTTP/1.1 200 OK
|
||||
content-length: 149
|
||||
content-type: application/json
|
||||
date: Thu, 10 Jul 2025 04:26:50 GMT
|
||||
|
||||
}]'
|
||||
[
|
||||
{
|
||||
"id": 60,
|
||||
"individual_size_gigabytes": 256,
|
||||
"item": 267267322597,
|
||||
"needed_description_check": false,
|
||||
"parse_engine": 0,
|
||||
"quantity": 5,
|
||||
"total_gigabytes": 1280
|
||||
{
|
||||
"item_id": 297545995095,
|
||||
"title": "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!",
|
||||
"parsed": {
|
||||
"total_gigabytes": 1024,
|
||||
"quantity": 1,
|
||||
"individual_size_gigabytes": 1024
|
||||
}
|
||||
},
|
||||
{
|
||||
"item_id": 127220979797,
|
||||
"title": "Kingston NV2 2TB M.2 3500MG/S NVMe Internal SSD PCIe 4.0 Gen SNV2S/2000G C-/#qWT",
|
||||
"parsed": {
|
||||
"total_gigabytes": 2048,
|
||||
"quantity": 1,
|
||||
"individual_size_gigabytes": 2048
|
||||
}
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
And now a LLM based parse, such that the prompt is this (189 Tokens for Gemini 2.5 Flash Lite)
|
||||
```
|
||||
I will provide you with a listing title I want you to analyse. Then you will tell me the total gigabytes of all drives listed in the listing, how many drives are specified in the title, and the gigabytes of each drive in the listing. Here is an example for a title of "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
|
||||
```
|
||||
{
|
||||
"total_gigabytes": 1024,
|
||||
"quantity": 1,
|
||||
"individual_size_gigabytes": 1024
|
||||
}
|
||||
```
|
||||
Reply with "OK" (and _only_ "OK") if you understand this. After you reply with that, I will provide you with a title, and then you will reply with solely the requested json (and ONLY said json).
|
||||
```
|
||||
|
||||
And passing a title of (30 tokens);
|
||||
```
|
||||
Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD
|
||||
```
|
||||
returns the following json of (41 tokens);
|
||||
```json
|
||||
{
|
||||
"total_gigabytes": 1536,
|
||||
"quantity": 3,
|
||||
"individual_size_gigabytes": 512
|
||||
}
|
||||
```
|
||||
|
||||
and another example of sending (49 tokens)
|
||||
```
|
||||
(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)
|
||||
```
|
||||
returns the following json of (42 tokens);
|
||||
```json
|
||||
{
|
||||
"total_gigabytes": 1536,
|
||||
"quantity": 6,
|
||||
"individual_size_gigabytes": 256
|
||||
}
|
||||
```
|
||||
|
||||
So for 1 listing we have a 189 Token "System Prompt", then a ~45 token title prompt, and 42 Token parsed reply. Given 30,000 listings, that's 5,670,000 Token "System Prompt" as Input, 1,350,000 Token Title prompt as Input, and 1,260,000 Token Parsed information (output). Assuming Gemini 2.5 Flash Mini which is $0.10/M for input and $0.40/M for output, would pay $0.702 for input and $0.504 for output, or $1.206 total.
|
||||
|
375
src/db.rs
375
src/db.rs
@@ -1,10 +1,11 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use num_enum::TryFromPrimitive;
|
||||
use rusqlite::Connection;
|
||||
use rusqlite::ToSql;
|
||||
use rusqlite::types::FromSql;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
use strum::{EnumIter, IntoEnumIterator};
|
||||
// use strum_macros::EnumIter;
|
||||
use tracing::{error, info};
|
||||
|
||||
pub trait DBTable {
|
||||
@@ -142,54 +143,30 @@ impl SearchURL {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, PartialEq, Clone, EnumIter)]
|
||||
pub enum ParsedPageStatus {
|
||||
PendingParse,
|
||||
Ready,
|
||||
}
|
||||
impl TryFrom<i64> for ParsedPageStatus {
|
||||
type Error = rusqlite::Error;
|
||||
|
||||
fn try_from(value: i64) -> Result<Self, Self::Error> {
|
||||
match value {
|
||||
0 => Ok(ParsedPageStatus::PendingParse),
|
||||
1 => Ok(ParsedPageStatus::Ready),
|
||||
_ => Err(rusqlite::Error::InvalidColumnType(
|
||||
2,
|
||||
"Invalid integer of {} for ParsedPageStatus".to_string(),
|
||||
rusqlite::types::Type::Integer,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, PartialEq, Clone)]
|
||||
pub struct Page {
|
||||
pub struct ParsedPage {
|
||||
pub timestamp: DateTime<Utc>,
|
||||
pub category: String,
|
||||
pub status: ParsedPageStatus,
|
||||
}
|
||||
impl DBTable for Page {
|
||||
const TABLE_NAME: &'static str = "Pages";
|
||||
impl DBTable for ParsedPage {
|
||||
const TABLE_NAME: &'static str = "Pages_Parsed";
|
||||
const TABLE_SCHEMA: &'static str = "
|
||||
id INTEGER PRIMARY KEY,
|
||||
category TEXT NOT NULL,
|
||||
timestamp INTEGER NOT NULL,
|
||||
status INTEGER NOT NULL,
|
||||
UNIQUE(category, timestamp)
|
||||
FOREIGN KEY(category) REFERENCES SearchURLs(name)
|
||||
";
|
||||
|
||||
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
|
||||
let mut stmt = conn.prepare(&format!(
|
||||
"SELECT category, timestamp, status FROM {}",
|
||||
"SELECT category, timestamp FROM {}",
|
||||
Self::TABLE_NAME
|
||||
))?;
|
||||
let iter = stmt.query_map([], |row| {
|
||||
Ok(Page {
|
||||
Ok(ParsedPage {
|
||||
category: row.get(0)?,
|
||||
timestamp: row.get(1)?,
|
||||
status: row.get::<_, i64>(2)?.try_into().unwrap(),
|
||||
})
|
||||
})?;
|
||||
|
||||
@@ -200,7 +177,7 @@ impl DBTable for Page {
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
impl Page {
|
||||
impl ParsedPage {
|
||||
pub fn lookup(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
|
||||
let mut stmt = conn
|
||||
.prepare(&format!(
|
||||
@@ -209,11 +186,10 @@ impl Page {
|
||||
))
|
||||
.ok()?;
|
||||
stmt.query_one([timestamp], |row| {
|
||||
Ok(Page {
|
||||
Ok(ParsedPage {
|
||||
// id: row.get(0)?,
|
||||
category: row.get(1)?,
|
||||
timestamp: row.get(2)?,
|
||||
status: row.get::<_, i64>(3)?.try_into().unwrap(),
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
@@ -223,81 +199,43 @@ impl Page {
|
||||
let _ = conn
|
||||
.execute(
|
||||
&format!(
|
||||
"INSERT OR REPLACE INTO {} (category, timestamp, status) VALUES (?1, ?2, ?3)",
|
||||
"INSERT OR REPLACE INTO {} (category, timestamp) VALUES (?1, ?2)",
|
||||
Self::TABLE_NAME
|
||||
),
|
||||
(&self.category, self.timestamp, self.status.clone() as i64),
|
||||
(&self.category, self.timestamp),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn lookup_status(
|
||||
conn: &Connection,
|
||||
status: ParsedPageStatus,
|
||||
category: &str,
|
||||
max: usize,
|
||||
) -> Vec<Self> {
|
||||
let mut stmt = conn
|
||||
.prepare(&format!(
|
||||
"SELECT category, timestamp, status FROM {} WHERE status = {} AND category = ?1 LIMIT {}",
|
||||
Self::TABLE_NAME,
|
||||
status.clone() as i64,
|
||||
max
|
||||
))
|
||||
.unwrap();
|
||||
stmt.query_map([category], |row| {
|
||||
Ok(Self {
|
||||
category: row.get(0)?,
|
||||
timestamp: row.get(1)?,
|
||||
status: row.get::<_, i64>(2)?.try_into().unwrap(),
|
||||
})
|
||||
})
|
||||
.unwrap()
|
||||
.inspect(|e| info!("debugging saw {:?}", e))
|
||||
.filter_map(|e| e.ok())
|
||||
.collect()
|
||||
#[repr(i64)]
|
||||
#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)]
|
||||
pub enum StorageParsingEngineVersion {
|
||||
Testing = 0,
|
||||
Regex = 1,
|
||||
LLM = 2,
|
||||
}
|
||||
impl ToSql for StorageParsingEngineVersion {
|
||||
fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
|
||||
Ok((*self as i64).into())
|
||||
}
|
||||
|
||||
pub fn category_stats(conn: &Connection, category: &str) -> Vec<(ParsedPageStatus, i64, i64)> {
|
||||
let mut res: Vec<(ParsedPageStatus, i64, i64)> = vec![];
|
||||
|
||||
for status in ParsedPageStatus::iter() {
|
||||
let cnt_category_status = conn
|
||||
.prepare(&format!(
|
||||
"SELECT COUNT(*) FROM {} WHERE category = ?1 AND status = {}",
|
||||
Self::TABLE_NAME,
|
||||
status.clone() as i64
|
||||
))
|
||||
.ok()
|
||||
.unwrap()
|
||||
.query_one([category], |r| r.get(0))
|
||||
.inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
|
||||
.unwrap_or(0);
|
||||
let cnt_category_total = conn
|
||||
.prepare(&format!(
|
||||
"SELECT COUNT(*) FROM {} WHERE category = ?1",
|
||||
Self::TABLE_NAME
|
||||
))
|
||||
.ok()
|
||||
.unwrap()
|
||||
.query_one([category], |r| r.get(0))
|
||||
.inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
|
||||
.unwrap_or(0);
|
||||
res.push((status, cnt_category_status, cnt_category_total));
|
||||
}
|
||||
res
|
||||
}
|
||||
impl FromSql for StorageParsingEngineVersion {
|
||||
fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
|
||||
let v = value.as_i64()?;
|
||||
Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
|
||||
#[derive(Serialize, Debug, PartialEq, Clone)]
|
||||
pub struct ParsedStorage {
|
||||
pub id: i64,
|
||||
pub item: i64,
|
||||
pub total_gigabytes: i64,
|
||||
pub quantity: i64,
|
||||
pub individual_size_gigabytes: i64,
|
||||
pub parse_engine: i64,
|
||||
pub needed_description_check: bool,
|
||||
pub parse_engine: StorageParsingEngineVersion,
|
||||
pub failed_reason: String,
|
||||
}
|
||||
impl DBTable for ParsedStorage {
|
||||
const TABLE_NAME: &'static str = "Storage_Parsed";
|
||||
@@ -308,13 +246,13 @@ impl DBTable for ParsedStorage {
|
||||
quantity INTEGER,
|
||||
sizes_gigabytes TEXT,
|
||||
parse_engine INTEGER,
|
||||
need_description_check INTEGER,
|
||||
failed_reason TEXT,
|
||||
UNIQUE(item, parse_engine)
|
||||
FOREIGN KEY(item) REFERENCES Listings(item_id)
|
||||
";
|
||||
|
||||
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
|
||||
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
|
||||
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?;
|
||||
let iter = stmt.query_map([], |row| {
|
||||
Ok(ParsedStorage {
|
||||
id: row.get(0)?,
|
||||
@@ -326,7 +264,7 @@ impl DBTable for ParsedStorage {
|
||||
r.parse().unwrap_or(0)
|
||||
},
|
||||
parse_engine: row.get(5)?,
|
||||
needed_description_check: row.get(6)?,
|
||||
failed_reason: row.get(6)?,
|
||||
})
|
||||
})?;
|
||||
|
||||
@@ -357,7 +295,7 @@ impl ParsedStorage {
|
||||
r.parse().unwrap()
|
||||
},
|
||||
parse_engine: row.get(5)?,
|
||||
needed_description_check: row.get(6)?,
|
||||
failed_reason: row.get(6)?,
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
@@ -367,21 +305,26 @@ impl ParsedStorage {
|
||||
}
|
||||
|
||||
pub fn add_or_update(&self, conn: &Connection) {
|
||||
let _ = conn.execute(&format!("
|
||||
let _ = conn
|
||||
.execute(
|
||||
&format!(
|
||||
"
|
||||
INSERT OR REPLACE INTO {}
|
||||
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
|
||||
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason)
|
||||
VALUES
|
||||
(?1, ?2, ?3, ?4, ?5, ?6)",
|
||||
Self::TABLE_NAME),
|
||||
(
|
||||
&self.item,
|
||||
self.total_gigabytes,
|
||||
self.quantity,
|
||||
self.individual_size_gigabytes.to_string(),
|
||||
self.parse_engine,
|
||||
self.needed_description_check
|
||||
Self::TABLE_NAME
|
||||
),
|
||||
(
|
||||
&self.item,
|
||||
self.total_gigabytes,
|
||||
self.quantity,
|
||||
self.individual_size_gigabytes.to_string(),
|
||||
self.parse_engine,
|
||||
&self.failed_reason,
|
||||
),
|
||||
)
|
||||
).unwrap();
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -402,7 +345,7 @@ impl DBTable for ItemAppearances {
|
||||
current_bid_usd_cents INTEGER,
|
||||
UNIQUE(item, timestamp),
|
||||
FOREIGN KEY(item) REFERENCES Listings(item_id),
|
||||
FOREIGN KEY(category, timestamp) REFERENCES Pages(category, timestamp)
|
||||
FOREIGN KEY(category, timestamp) REFERENCES Pages_Parsed(category, timestamp)
|
||||
";
|
||||
|
||||
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
|
||||
@@ -578,19 +521,40 @@ impl Listing {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
|
||||
let mut stmt = conn
|
||||
.prepare(&format!(
|
||||
"
|
||||
SELECT ei.item_id, ei.title FROM {} AS ei
|
||||
LEFT JOIN {} AS sp ON ei.item_id = sp.item
|
||||
WHERE sp.item IS NULL",
|
||||
Self::TABLE_NAME,
|
||||
ParsedStorage::TABLE_NAME
|
||||
))
|
||||
pub fn lookup_pending_parse(
|
||||
conn: &Connection,
|
||||
allowed_engines: &[i64],
|
||||
count_limit: u64,
|
||||
) -> Vec<(i64, String)> {
|
||||
let engines_filter = if !allowed_engines.is_empty() {
|
||||
format!(
|
||||
"AND ({})",
|
||||
allowed_engines
|
||||
.iter()
|
||||
.map(|e| "ps.parse_engine = ".to_owned() + &e.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" OR ")
|
||||
)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let query = format!(
|
||||
"
|
||||
SELECT listing.item_id, listing.title FROM {0} AS listing
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM {1} AS ps
|
||||
WHERE listing.item_id = ps.item {engines_filter}
|
||||
)
|
||||
LIMIT {count_limit}
|
||||
",
|
||||
Self::TABLE_NAME,
|
||||
ParsedStorage::TABLE_NAME
|
||||
);
|
||||
conn.prepare(&query)
|
||||
.ok()
|
||||
.unwrap();
|
||||
stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
|
||||
.unwrap()
|
||||
.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
|
||||
.ok()
|
||||
.unwrap()
|
||||
.map(|e| e.unwrap())
|
||||
@@ -649,7 +613,7 @@ pub fn listings_get_filtered(
|
||||
history: ItemAppearances::lookup(conn, l.item_id),
|
||||
parsed: ParsedStorage::lookup(conn, l.item_id),
|
||||
})
|
||||
.filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
|
||||
.filter(|lr| lr.parsed.iter().any(|p| p.failed_reason.is_empty()))
|
||||
.collect::<Vec<ListingsFilterResult>>();
|
||||
info!(
|
||||
"Found total {} listings since (str:{} epoch:{})",
|
||||
@@ -698,6 +662,125 @@ pub fn listings_get_filtered(
|
||||
listings
|
||||
}
|
||||
|
||||
#[repr(i64)]
|
||||
#[derive(Serialize, Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
|
||||
pub enum StorageLLMVersion {
|
||||
Testing = 0,
|
||||
Gemini2d5Prompt0 = 1,
|
||||
}
|
||||
impl ToSql for StorageLLMVersion {
|
||||
fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
|
||||
Ok((*self as i64).into())
|
||||
}
|
||||
}
|
||||
impl FromSql for StorageLLMVersion {
|
||||
fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
|
||||
let v = value.as_i64()?;
|
||||
Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
|
||||
}
|
||||
}
|
||||
|
||||
// This is mostly meant as a way to cache all of these.
|
||||
#[derive(Serialize, Debug, PartialEq, Clone)]
|
||||
pub struct ParsedLLMStorageResult {
|
||||
pub id: i64,
|
||||
pub item_id: i64,
|
||||
pub title: String,
|
||||
pub quantity: i64,
|
||||
pub gigabytes: i64,
|
||||
pub fail_reason: String,
|
||||
pub llm_id: StorageLLMVersion,
|
||||
}
|
||||
impl DBTable for ParsedLLMStorageResult {
|
||||
const TABLE_NAME: &'static str = "ParsedLLMStorageResult";
|
||||
const TABLE_SCHEMA: &'static str = "
|
||||
id INTEGER PRIMARY KEY,
|
||||
item_id INTEGER NOT NULL UNIQUE,
|
||||
title TEXT NOT NULL,
|
||||
quantity INTEGER NOT NULL,
|
||||
gigabytes INTEGER NOT NULL,
|
||||
fail_reason TEXT NOT NULL,
|
||||
llm_id INTEGER NOT NULL
|
||||
";
|
||||
|
||||
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
|
||||
let mut stmt = conn.prepare(&format!(
|
||||
"SELECT id, item_id, title, quantity, gigabytes, fail_reason, llm_id FROM {}",
|
||||
Self::TABLE_NAME
|
||||
))?;
|
||||
let iter = stmt.query_map([], |row| {
|
||||
Ok(ParsedLLMStorageResult {
|
||||
id: row.get(0)?,
|
||||
item_id: row.get(1)?,
|
||||
title: row.get(2)?,
|
||||
quantity: row.get(3)?,
|
||||
gigabytes: row.get(4)?,
|
||||
fail_reason: row.get(5)?,
|
||||
llm_id: row.get(6)?,
|
||||
})
|
||||
})?;
|
||||
|
||||
let mut result = Vec::new();
|
||||
for item in iter {
|
||||
result.push(item?);
|
||||
}
|
||||
Ok(result)
|
||||
}
|
||||
}
|
||||
impl ParsedLLMStorageResult {
|
||||
pub fn lookup(conn: &Connection, item_id: i64) -> Option<ParsedLLMStorageResult> {
|
||||
let mut stmt = conn
|
||||
.prepare(&format!(
|
||||
"SELECT * FROM {} WHERE item_id = ?",
|
||||
Self::TABLE_NAME
|
||||
))
|
||||
.ok()?;
|
||||
stmt.query_one([item_id], |row| {
|
||||
Ok(ParsedLLMStorageResult {
|
||||
id: row.get(0)?,
|
||||
item_id: row.get(1)?,
|
||||
title: row.get(2)?,
|
||||
quantity: row.get(3)?,
|
||||
gigabytes: row.get(4)?,
|
||||
fail_reason: row.get(5)?,
|
||||
llm_id: row.get(6)?,
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
}
|
||||
|
||||
pub fn add_or_update(&self, conn: &Connection) {
|
||||
let count = conn
|
||||
.execute(
|
||||
&format!(
|
||||
"INSERT OR REPLACE INTO {}
|
||||
(
|
||||
item_id,
|
||||
title,
|
||||
quantity,
|
||||
gigabytes,
|
||||
fail_reason,
|
||||
llm_id
|
||||
)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
||||
Self::TABLE_NAME
|
||||
),
|
||||
(
|
||||
self.item_id,
|
||||
&self.title,
|
||||
self.quantity,
|
||||
self.gigabytes,
|
||||
self.fail_reason.clone(),
|
||||
self.llm_id,
|
||||
),
|
||||
)
|
||||
.unwrap();
|
||||
if count != 1 {
|
||||
panic!("Expected count to be 1 but got {}", count);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_initialized(path: Option<&Path>) -> Connection {
|
||||
let conn = match path {
|
||||
Some(p) => Connection::open(&p),
|
||||
@@ -708,8 +791,9 @@ pub fn get_initialized(path: Option<&Path>) -> Connection {
|
||||
SearchURL::initialize(&conn);
|
||||
Listing::initialize(&conn);
|
||||
ParsedStorage::initialize(&conn);
|
||||
Page::initialize(&conn);
|
||||
ParsedPage::initialize(&conn);
|
||||
ItemAppearances::initialize(&conn);
|
||||
ParsedLLMStorageResult::initialize(&conn);
|
||||
|
||||
conn
|
||||
}
|
||||
@@ -721,6 +805,7 @@ pub struct Stats {
|
||||
rows_parsed_storage: i64,
|
||||
rows_parsed_page: i64,
|
||||
rows_item_appearances: i64,
|
||||
pub rows_parsed_storage_llm: i64,
|
||||
}
|
||||
|
||||
pub fn get_stats(conn: &Connection) -> Stats {
|
||||
@@ -728,8 +813,9 @@ pub fn get_stats(conn: &Connection) -> Stats {
|
||||
rows_search_url: SearchURL::get_count(conn),
|
||||
rows_listing: Listing::get_count(conn),
|
||||
rows_parsed_storage: ParsedStorage::get_count(conn),
|
||||
rows_parsed_page: Page::get_count(conn),
|
||||
rows_parsed_page: ParsedPage::get_count(conn),
|
||||
rows_item_appearances: ItemAppearances::get_count(conn),
|
||||
rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -751,7 +837,7 @@ mod tests {
|
||||
let listing = Listing {
|
||||
id: 1,
|
||||
item_id: 1234,
|
||||
title: "Some Title".to_string(),
|
||||
title: "Lot of 2 512GB SSD 6gb/s working with 5% wear".to_string(),
|
||||
buy_it_now_price_cents: Some(123),
|
||||
has_best_offer: false,
|
||||
image_url: "google.com".to_string(),
|
||||
@@ -765,34 +851,18 @@ mod tests {
|
||||
total_gigabytes: 13,
|
||||
quantity: 3,
|
||||
individual_size_gigabytes: 13,
|
||||
parse_engine: 9,
|
||||
needed_description_check: true,
|
||||
parse_engine: StorageParsingEngineVersion::Testing,
|
||||
failed_reason: "".to_owned(),
|
||||
};
|
||||
parsed.add_or_update(&db);
|
||||
assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
|
||||
|
||||
let page = Page {
|
||||
let page = ParsedPage {
|
||||
category: "ssd".to_owned(),
|
||||
timestamp: std::time::SystemTime::now().into(),
|
||||
status: ParsedPageStatus::PendingParse,
|
||||
};
|
||||
page.add_or_update(&db);
|
||||
assert_eq!(Page::lookup(&db, page.timestamp), Some(page.clone()));
|
||||
assert_eq!(
|
||||
Page::lookup_status(&db, ParsedPageStatus::PendingParse, "ssd", 10),
|
||||
vec![page.clone()]
|
||||
);
|
||||
assert_eq!(
|
||||
Page::lookup_status(&db, ParsedPageStatus::Ready, "ssd", 10),
|
||||
vec![]
|
||||
);
|
||||
assert_eq!(
|
||||
Page::category_stats(&db, "ssd"),
|
||||
vec![
|
||||
(ParsedPageStatus::PendingParse, 1, 1),
|
||||
(ParsedPageStatus::Ready, 0, 1)
|
||||
]
|
||||
);
|
||||
assert_eq!(ParsedPage::lookup(&db, page.timestamp), Some(page.clone()));
|
||||
|
||||
let apperance = ItemAppearances {
|
||||
item: listing.item_id,
|
||||
@@ -806,6 +876,21 @@ mod tests {
|
||||
vec![apperance]
|
||||
);
|
||||
|
||||
let parsedllmstorage = ParsedLLMStorageResult {
|
||||
fail_reason: "Some reason".to_owned(),
|
||||
gigabytes: 12,
|
||||
id: 1,
|
||||
item_id: 12345,
|
||||
quantity: 32,
|
||||
title: "Some Title".to_owned(),
|
||||
llm_id: StorageLLMVersion::Testing,
|
||||
};
|
||||
parsedllmstorage.add_or_update(&db);
|
||||
assert_eq!(
|
||||
ParsedLLMStorageResult::lookup(&db, parsedllmstorage.item_id),
|
||||
Some(parsedllmstorage)
|
||||
);
|
||||
|
||||
assert_eq!(Listing::lookup_since(&db, page.timestamp, 3), vec![listing]);
|
||||
assert_eq!(
|
||||
Listing::lookup_since(&db, page.timestamp + chrono::Duration::seconds(1), 3),
|
||||
|
@@ -1,5 +1,5 @@
|
||||
pub mod db;
|
||||
pub mod parser;
|
||||
pub mod parser_ebay;
|
||||
pub mod parser_storage;
|
||||
pub mod xdg_dirs;
|
||||
pub mod parser_storage_e0;
|
||||
pub mod parser_storage_e1;
|
||||
|
243
src/main.rs
243
src/main.rs
@@ -1,25 +1,24 @@
|
||||
use actix_web::{App, HttpServer, Responder, Result, get, post, rt, web, web::Data};
|
||||
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
|
||||
use chrono::{DateTime, Utc};
|
||||
use clap::Parser;
|
||||
use ebay_scraper_rust::db;
|
||||
use ebay_scraper_rust::db::DBTable;
|
||||
use ebay_scraper_rust::db::Page;
|
||||
use ebay_scraper_rust::parser;
|
||||
use ebay_scraper_rust::parser_storage;
|
||||
use ebay_scraper_rust::xdg_dirs;
|
||||
// use rt::mpsc;
|
||||
// use rt::time::timeout;
|
||||
use ebay_scraper_rust::parser::parse_dir;
|
||||
use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
|
||||
use futures::future::join_all;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Mutex;
|
||||
use std::time::Instant;
|
||||
use tracing::{info, instrument};
|
||||
use tracing::{error, info, instrument};
|
||||
|
||||
use tracing_subscriber::filter::EnvFilter;
|
||||
use tracing_subscriber::fmt;
|
||||
use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt;
|
||||
use tracing_subscriber::util::SubscriberInitExt;
|
||||
|
||||
mod xdg_dirs;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[clap(
|
||||
name = "ebay-scraper-rust",
|
||||
@@ -28,6 +27,13 @@ use tracing_subscriber::util::SubscriberInitExt;
|
||||
)]
|
||||
struct Args {}
|
||||
|
||||
struct AppCtx {
|
||||
db: rusqlite::Connection,
|
||||
db_llm: rusqlite::Connection,
|
||||
download_dir: PathBuf,
|
||||
llm_parser: Option<actix_web::rt::task::JoinHandle<()>>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct ListingsFilter {
|
||||
since: Option<i64>,
|
||||
@@ -37,12 +43,12 @@ struct ListingsFilter {
|
||||
|
||||
#[get("/listings")]
|
||||
async fn listings_filtered_get(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
ctx: Data<Mutex<AppCtx>>,
|
||||
filter: web::Query<ListingsFilter>,
|
||||
) -> Result<impl Responder> {
|
||||
let start = Instant::now();
|
||||
let res = db::listings_get_filtered(
|
||||
&db.lock().unwrap(),
|
||||
&ctx.lock().unwrap().db,
|
||||
&DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
|
||||
filter.limit.unwrap_or(1_000),
|
||||
filter.cents_per_tbytes_max.unwrap_or(100_00),
|
||||
@@ -57,20 +63,14 @@ async fn listings_filtered_get(
|
||||
}
|
||||
|
||||
#[get("/listing/{id}")]
|
||||
async fn listing_get(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
id: web::Path<i64>,
|
||||
) -> Result<impl Responder> {
|
||||
Ok(web::Json(db::Listing::lookup(&db.lock().unwrap(), *id)))
|
||||
async fn listing_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
|
||||
Ok(web::Json(db::Listing::lookup(&ctx.lock().unwrap().db, *id)))
|
||||
}
|
||||
|
||||
#[get("/listing/{id}/parsed")]
|
||||
async fn listing_parse_get(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
id: web::Path<i64>,
|
||||
) -> Result<impl Responder> {
|
||||
async fn listing_parse_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
|
||||
Ok(web::Json(db::ParsedStorage::lookup(
|
||||
&db.lock().unwrap(),
|
||||
&ctx.lock().unwrap().db,
|
||||
*id,
|
||||
)))
|
||||
}
|
||||
@@ -83,10 +83,10 @@ struct APIHistory {
|
||||
|
||||
#[get("/listing/{id}/history")]
|
||||
async fn listing_history_get(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
ctx: Data<Mutex<AppCtx>>,
|
||||
id: web::Path<i64>,
|
||||
) -> Result<impl Responder> {
|
||||
let history: Vec<_> = db::ItemAppearances::lookup(&db.lock().unwrap(), *id)
|
||||
let history: Vec<_> = db::ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
|
||||
.iter()
|
||||
// .inspect(|e| info!("got: {:?}", e))
|
||||
.filter_map(|e| {
|
||||
@@ -99,91 +99,134 @@ async fn listing_history_get(
|
||||
Ok(web::Json(history))
|
||||
}
|
||||
|
||||
#[post("/listing/parse")]
|
||||
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
|
||||
let mut cnt = 0;
|
||||
let db_unlocked = db.lock().unwrap();
|
||||
db::Listing::lookup_non_parsed(&db_unlocked)
|
||||
async fn storage_parse_work(entries: &[(i64, String)]) -> Vec<db::ParsedStorage> {
|
||||
let llm_futures: Vec<_> = entries
|
||||
.iter()
|
||||
.map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
|
||||
.inspect(|_| cnt = cnt + 1)
|
||||
.for_each(|ps| ps.add_or_update(&db_unlocked));
|
||||
.map(|(id, title)| parser_storage_e1::parse_size_and_quantity_llm(*id, title))
|
||||
.collect();
|
||||
let llm_future_results = join_all(llm_futures).await;
|
||||
let llm_results = llm_future_results
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|e| db::ParsedStorage {
|
||||
id: 0,
|
||||
item: e.item_id,
|
||||
total_gigabytes: e.quantity * e.gigabytes,
|
||||
quantity: e.quantity,
|
||||
individual_size_gigabytes: e.gigabytes,
|
||||
failed_reason: e.fail_reason.clone(),
|
||||
parse_engine: db::StorageParsingEngineVersion::LLM,
|
||||
});
|
||||
// .inspect(|e| e.add_or_update(&unlocked.db))
|
||||
// .map(|e| db::ParsedStorage {
|
||||
// id: 0,
|
||||
// item: e.item_id,
|
||||
// total_gigabytes: e.quantity * e.gigabytes,
|
||||
// quantity: e.quantity,
|
||||
// individual_size_gigabytes: e.gigabytes,
|
||||
// needed_description_check: !e.fail_reason.is_empty(),
|
||||
// parse_engine: db::StorageParsingEngineVersion::LLM,
|
||||
// })
|
||||
// .for_each(|e| e.add_or_update(&unlocked.db));
|
||||
|
||||
Ok(web::Json(cnt))
|
||||
// And a regex based parse.
|
||||
let regex_results = entries
|
||||
.iter()
|
||||
.map(|(id, title)| parser_storage_e0::parse_size_and_quantity(*id, &title));
|
||||
// .for_each(|e| e.add_or_update(&unlocked.db));
|
||||
|
||||
regex_results.chain(llm_results).collect()
|
||||
}
|
||||
|
||||
fn storage_parse_worker(ctx: Data<Mutex<AppCtx>>) -> actix_web::rt::task::JoinHandle<()> {
|
||||
actix_web::rt::spawn(async move {
|
||||
loop {
|
||||
actix_web::rt::time::sleep(std::time::Duration::from_millis(1000)).await;
|
||||
let ctx_unlocked = ctx.lock().unwrap();
|
||||
let entries = db::Listing::lookup_pending_parse(&ctx_unlocked.db, &[], 10);
|
||||
let parsed = storage_parse_work(entries.as_slice()).await;
|
||||
for p in parsed {
|
||||
p.add_or_update(&ctx_unlocked.db);
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[post("/listing/parse")]
|
||||
async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
// Prepare a background parser to go through and use an LLM to parse the
|
||||
// storage info.
|
||||
if ctx.lock().unwrap().llm_parser.is_none() {
|
||||
ctx.clone().lock().unwrap().llm_parser = Some(storage_parse_worker(ctx.clone()));
|
||||
}
|
||||
|
||||
// Lets grab a few entries and then try parsing them with two engines.
|
||||
let ctx_locked = ctx.lock().unwrap();
|
||||
let entries: Vec<_> = db::Listing::lookup_pending_parse(&ctx_locked.db, &[], 100)
|
||||
.iter()
|
||||
.take(10)
|
||||
.map(|e| e.clone())
|
||||
.collect();
|
||||
for (item_id, title) in &entries {
|
||||
let ps1 =
|
||||
parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await;
|
||||
if ps1.is_some() {
|
||||
// info!(
|
||||
// "Parsed using an LLM title:{} and results:{:?}",
|
||||
// title,
|
||||
// ps1.unwrap()
|
||||
// );
|
||||
ps1.unwrap().add_or_update(&ctx_locked.db);
|
||||
// ps1.unwrap().add_or_update(&ctx_locked.db_llm); No need
|
||||
} else {
|
||||
error!("Failed to parse {item_id} with title {title}");
|
||||
}
|
||||
}
|
||||
Ok(web::Json(entries.len()))
|
||||
}
|
||||
|
||||
#[get("/category")]
|
||||
async fn category_getnames(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
|
||||
Ok(web::Json(db::SearchURL::names(&db.lock().unwrap())))
|
||||
}
|
||||
|
||||
async fn category_discover_worker(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
downloaddir: Data<PathBuf>,
|
||||
category: web::Path<String>,
|
||||
) {
|
||||
}
|
||||
|
||||
#[post("/category/{category}/discover")]
|
||||
#[instrument(skip_all)]
|
||||
async fn category_discover(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
downloaddir: Data<PathBuf>,
|
||||
category: web::Path<String>,
|
||||
) -> Result<impl Responder> {
|
||||
let start = Instant::now();
|
||||
parser::add_pages(
|
||||
&db.lock().unwrap(),
|
||||
&downloaddir.join(category.clone()),
|
||||
&category,
|
||||
);
|
||||
let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
|
||||
|
||||
info!("Added many pages to the category, took {elapsed} ms.");
|
||||
Ok("")
|
||||
async fn category_getnames(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
Ok(web::Json(db::SearchURL::names(&ctx.lock().unwrap().db)))
|
||||
}
|
||||
|
||||
#[post("/category/{category}/parse")]
|
||||
#[instrument(skip_all)]
|
||||
async fn category_parse(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
downloaddir: Data<PathBuf>,
|
||||
category: web::Path<String>,
|
||||
) -> Result<impl Responder> {
|
||||
parser::parse_pages(&db.lock().unwrap(), &downloaddir, &category, 100);
|
||||
Ok("")
|
||||
}
|
||||
|
||||
#[get("/category/{category}/parse")]
|
||||
#[instrument(skip_all)]
|
||||
async fn category_parse_get(
|
||||
db: Data<Mutex<rusqlite::Connection>>,
|
||||
ctx: Data<Mutex<AppCtx>>,
|
||||
category: web::Path<String>,
|
||||
) -> Result<impl Responder> {
|
||||
let start = Instant::now();
|
||||
let stats = Page::category_stats(&db.lock().unwrap(), &category);
|
||||
stats
|
||||
.iter()
|
||||
.for_each(|(status, cnt, total)| info!("{:?} {} {}", status, cnt, total));
|
||||
let ctx_unlocked = ctx.lock().unwrap();
|
||||
let count = parse_dir(
|
||||
&ctx_unlocked.download_dir.join(category.clone()),
|
||||
&category,
|
||||
&ctx_unlocked.db,
|
||||
)
|
||||
.unwrap();
|
||||
let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
|
||||
info!("Found, took {elapsed} ms.");
|
||||
Ok(web::Json(stats))
|
||||
|
||||
info!("Added {count} listings, took {elapsed} ms.");
|
||||
Ok(count.to_string())
|
||||
}
|
||||
|
||||
#[get("/stats")]
|
||||
async fn stats_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
|
||||
Ok(web::Json(db::get_stats(&db.lock().unwrap())))
|
||||
async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
let mut stats_db = db::get_stats(&ctx.lock().unwrap().db);
|
||||
let stats_db_llm = db::get_stats(&ctx.lock().unwrap().db_llm);
|
||||
stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
|
||||
Ok(web::Json(stats_db))
|
||||
}
|
||||
|
||||
#[get("/admin")]
|
||||
async fn admin_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
|
||||
let db = db.lock().unwrap();
|
||||
async fn admin_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
let ctx_locked = ctx.lock().unwrap();
|
||||
let query_start_time = Instant::now();
|
||||
let search_urls = db::SearchURL::get_all(&db).unwrap_or_default();
|
||||
let parsed_pages = db::Page::get_all(&db).unwrap_or_default();
|
||||
let parsed_storages = db::ParsedStorage::get_all(&db).unwrap_or_default();
|
||||
let item_appearances = db::ItemAppearances::get_all(&db).unwrap_or_default();
|
||||
let listings = db::Listing::get_all(&db).unwrap_or_default();
|
||||
let search_urls = db::SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let parsed_pages = db::ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let parsed_storages = db::ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let item_appearances = db::ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let listings = db::Listing::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
|
||||
|
||||
let html_gen_start_time = Instant::now();
|
||||
@@ -310,10 +353,6 @@ fn generate_table<T: Serialize>(title: &str, data: &[T]) -> String {
|
||||
)
|
||||
}
|
||||
|
||||
async fn pages_pickup() -> std::io::Result<()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[actix_web::main]
|
||||
async fn main() -> std::io::Result<()> {
|
||||
tracing_subscriber::registry()
|
||||
@@ -327,10 +366,22 @@ async fn main() -> std::io::Result<()> {
|
||||
"Starting with scraped data dir of \"{}\".",
|
||||
scrapedatadir.to_str().unwrap()
|
||||
);
|
||||
let db_mutex = Data::new(Mutex::new(db::get_initialized(None)));
|
||||
|
||||
let app_data = Data::new(Mutex::new(AppCtx {
|
||||
download_dir: scrapedatadir.clone(),
|
||||
db: db::get_initialized(None),
|
||||
db_llm: {
|
||||
let db_path = scrapedatadir.with_file_name("llm.sqlite");
|
||||
let db = rusqlite::Connection::open(&db_path).unwrap();
|
||||
db::ParsedLLMStorageResult::initialize(&db);
|
||||
info!("Created {:?} for caching LLM parsed title.", db_path);
|
||||
db
|
||||
},
|
||||
llm_parser: None,
|
||||
}));
|
||||
|
||||
// Prepare our backend via pulling in what catagories we are preconfigured with.
|
||||
db::SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");
|
||||
db::SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
|
||||
|
||||
HttpServer::new(move || {
|
||||
App::new()
|
||||
@@ -338,23 +389,17 @@ async fn main() -> std::io::Result<()> {
|
||||
.service(listing_get)
|
||||
.service(listings_filtered_get)
|
||||
.service(listing_history_get)
|
||||
.service(listing_parse_get)
|
||||
// Category handlers
|
||||
.service(parse_listings)
|
||||
.service(category_parse)
|
||||
.service(category_discover)
|
||||
.service(category_getnames)
|
||||
.service(category_parse_get)
|
||||
// Gnarly info dump
|
||||
.service(admin_get)
|
||||
.service(stats_get)
|
||||
// Stuff which is passed into every request.
|
||||
.app_data(db_mutex.clone())
|
||||
.app_data(Data::new(scrapedatadir.clone()))
|
||||
.app_data(app_data.clone())
|
||||
})
|
||||
.bind(("0.0.0.0", 9876))?
|
||||
.run()
|
||||
.await
|
||||
|
||||
// tokio::join!(server, pages_pickup)
|
||||
}
|
||||
|
142
src/parser.rs
142
src/parser.rs
@@ -1,13 +1,14 @@
|
||||
use crate::{
|
||||
db::{Page, ParsedPageStatus, SearchURL},
|
||||
db::{ParsedPage, SearchURL},
|
||||
parser_ebay,
|
||||
};
|
||||
use rayon::prelude::*;
|
||||
use serde::Deserialize;
|
||||
use serde_json;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
use std::{io::Read, path::Path};
|
||||
use tracing::{debug, error, info};
|
||||
use zstd;
|
||||
|
||||
fn timestamps_from_dir(path: &Path) -> Vec<i64> {
|
||||
if !std::fs::exists(path).expect("Directory must exist") {
|
||||
@@ -20,23 +21,52 @@ fn timestamps_from_dir(path: &Path) -> Vec<i64> {
|
||||
std::fs::read_dir(path)
|
||||
.unwrap()
|
||||
.map(|fpath| fpath.unwrap().path())
|
||||
.filter_map(|fstem| {
|
||||
fstem
|
||||
.file_stem()
|
||||
.and_then(|s| s.to_str())
|
||||
.expect("Invalid file name")
|
||||
.parse()
|
||||
.ok()
|
||||
.filter_map(|fname| {
|
||||
// Turns out file_stem() doesn't handle multiple extensions and
|
||||
// file_prefix() is still in not stable.
|
||||
Some(fname.file_stem()?.to_str()?.split_once('.')?.0.to_owned())
|
||||
})
|
||||
.filter_map(|fname| fname.parse().ok())
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn add_pages(db: &rusqlite::Connection, dir: &Path, category: &str) {
|
||||
fn read_timestamp_from_dir(
|
||||
dir: &Path,
|
||||
timestamp: &chrono::DateTime<chrono::Utc>,
|
||||
) -> Option<String> {
|
||||
// First check for the normal html version, which we can just read straight.
|
||||
let page_path = dir.join(format!("{}.{}", timestamp.timestamp(), "html"));
|
||||
if page_path.exists() {
|
||||
return std::fs::read_to_string(&page_path)
|
||||
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
|
||||
.ok();
|
||||
}
|
||||
|
||||
// And now if it's compresed but with zstd.
|
||||
let page_path = dir.join(format!("{}.{}.{}", timestamp.timestamp(), "html", "zst"));
|
||||
if page_path.exists() {
|
||||
let f = std::fs::File::open(&page_path)
|
||||
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
|
||||
.ok()?;
|
||||
let mut s = String::new();
|
||||
zstd::Decoder::new(f).ok()?.read_to_string(&mut s).ok()?;
|
||||
return Some(s);
|
||||
}
|
||||
|
||||
error!(
|
||||
"Failed to lookup file for timestamp {} in {}, bailing ...",
|
||||
timestamp,
|
||||
dir.display()
|
||||
);
|
||||
None
|
||||
}
|
||||
|
||||
pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
|
||||
// Ensure the category is created.
|
||||
let url_fpath = dir.join("url.json");
|
||||
let url_contents = std::fs::read_to_string(&url_fpath)
|
||||
.inspect_err(|e| error!("Failed reading {}: {e}", url_fpath.display()))
|
||||
.unwrap();
|
||||
.ok()?;
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct URLJSON {
|
||||
@@ -50,75 +80,59 @@ pub fn add_pages(db: &rusqlite::Connection, dir: &Path, category: &str) {
|
||||
|
||||
// See all pages haven't been seen before.
|
||||
let query_start_time = Instant::now();
|
||||
let to_parse = timestamps_from_dir(dir)
|
||||
.into_iter()
|
||||
.filter(|t| {
|
||||
let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
|
||||
let p = Page::lookup(&db, ts);
|
||||
|
||||
// Timestamp never seen before, lets pass it on.
|
||||
if p.is_none() {
|
||||
info!(
|
||||
"Page Timestamp:{} Catagory:{category} never seen before",
|
||||
ts.timestamp()
|
||||
);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Timestamp was seen before *and* from the same catagory, don't pass it on.
|
||||
if p.unwrap().category == *category {
|
||||
info!(
|
||||
"Page Timestamp:{} Catagory:{category} seen before, skipping",
|
||||
ts.timestamp()
|
||||
);
|
||||
return false;
|
||||
}
|
||||
let to_parse = timestamps_from_dir(dir).into_iter().filter(|t| {
|
||||
let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
|
||||
let p = ParsedPage::lookup(&db, ts);
|
||||
|
||||
// Timestamp never seen before, lets pass it on.
|
||||
if p.is_none() {
|
||||
info!(
|
||||
"Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}",
|
||||
"Page Timestamp:{} Catagory:{category} never seen before, processing ...",
|
||||
ts.timestamp()
|
||||
);
|
||||
return true;
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
}
|
||||
|
||||
// Timestamp was seen before *and* from the same catagory, don't pass
|
||||
// it on.
|
||||
if p.unwrap().category == *category {
|
||||
info!(
|
||||
"Page Timestamp:{} Catagory:{category} seen before, skipping ...",
|
||||
ts.timestamp()
|
||||
);
|
||||
return false;
|
||||
}
|
||||
|
||||
info!(
|
||||
"Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}, processing ...",
|
||||
ts.timestamp()
|
||||
);
|
||||
return true;
|
||||
}).collect::<Vec<_>>();
|
||||
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
|
||||
info!("Time spent finding pages to parse:{total_query_time} ms");
|
||||
|
||||
// Say we are going to parse the pages.
|
||||
let query_start_time = Instant::now();
|
||||
let pages = to_parse.iter().map(|p| Page {
|
||||
timestamp: chrono::DateTime::from_timestamp(*p, 0).unwrap(),
|
||||
category: category.to_string(),
|
||||
status: crate::db::ParsedPageStatus::PendingParse,
|
||||
});
|
||||
for p in pages {
|
||||
p.add_or_update(&db);
|
||||
}
|
||||
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
|
||||
info!("Time spent inserting pages marked as ready to parse:{total_query_time} ms");
|
||||
}
|
||||
|
||||
pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch: usize) {
|
||||
let to_parse = Page::lookup_status(&db, ParsedPageStatus::PendingParse, category, batch);
|
||||
// For each page, read the file and parse it.
|
||||
let query_start_time = Instant::now();
|
||||
let to_add = to_parse
|
||||
.par_iter()
|
||||
.map(|p| {
|
||||
let page_path = dir
|
||||
.join(category)
|
||||
.join(format!("{}.html", p.timestamp.timestamp()));
|
||||
let page_contents = std::fs::read_to_string(&page_path)
|
||||
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
|
||||
.ok()?;
|
||||
let ts = chrono::DateTime::from_timestamp(*p, 0).unwrap();
|
||||
let paged_info = ParsedPage {
|
||||
timestamp: ts,
|
||||
category: category.to_string(),
|
||||
};
|
||||
|
||||
let page_contents = read_timestamp_from_dir(dir, &ts)?;
|
||||
let elements =
|
||||
parser_ebay::parse_from_ebay_page(&page_contents, &p.timestamp, &category).unwrap();
|
||||
parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
|
||||
info!(
|
||||
"Page Timestamp:{} Catagory:{category}, found {} elements",
|
||||
p.timestamp.timestamp(),
|
||||
ts.timestamp(),
|
||||
elements.len()
|
||||
);
|
||||
|
||||
Some((p, elements))
|
||||
Some((paged_info, elements))
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
|
||||
@@ -126,6 +140,7 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
|
||||
|
||||
// And lastly add it to our database!
|
||||
let query_start_time = Instant::now();
|
||||
let mut added_count = 0;
|
||||
for iter in to_add {
|
||||
if iter.is_none() {
|
||||
continue;
|
||||
@@ -134,6 +149,7 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
|
||||
paged_info.add_or_update(&db);
|
||||
|
||||
for e in elements {
|
||||
added_count += 1;
|
||||
e.0.add_or_update(&db);
|
||||
e.1.add_or_update(&db);
|
||||
debug!(
|
||||
@@ -146,4 +162,6 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
|
||||
}
|
||||
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
|
||||
info!("Time spent adding parsed pages: {total_query_time} ms");
|
||||
|
||||
return Some(added_count);
|
||||
}
|
||||
|
@@ -172,12 +172,22 @@ pub fn parse_from_ebay_page(
|
||||
mod tests {
|
||||
use super::*;
|
||||
use similar_asserts::assert_eq;
|
||||
use std::io::Read;
|
||||
use zstd;
|
||||
|
||||
#[test_log::test]
|
||||
fn parse() {
|
||||
let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
|
||||
let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
|
||||
let parsed = parse_from_ebay_page(html, ×tamp, "ssd").unwrap();
|
||||
let zstd = include_bytes!("../test_data/scraper/raw_scraped/ssd/1750369463.html.zst");
|
||||
let cursor = std::io::Cursor::new(zstd);
|
||||
|
||||
let mut html = String::new();
|
||||
zstd::Decoder::new(cursor)
|
||||
.unwrap()
|
||||
.read_to_string(&mut html)
|
||||
.unwrap();
|
||||
|
||||
let parsed = parse_from_ebay_page(&html, ×tamp, "ssd").unwrap();
|
||||
// assert_eq!(parsed.len(), 62);
|
||||
|
||||
let parsed = parsed.first_chunk::<10>().unwrap();
|
||||
|
@@ -32,7 +32,7 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
let upper_title = title.to_uppercase();
|
||||
let mut total_gb = 0i64;
|
||||
let mut quantity = 1i64;
|
||||
let mut needed_description_check = false;
|
||||
let mut failed_reason = String::new();
|
||||
let mut individual_size_gb = 0i64;
|
||||
|
||||
for pattern in EXPLICIT_QTY_PATTERNS.iter() {
|
||||
@@ -68,36 +68,35 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
if !unique_sizes_gb.is_empty() {
|
||||
individual_size_gb = unique_sizes_gb[0];
|
||||
if unique_sizes_gb.len() > 1 {
|
||||
needed_description_check = true;
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if SIZE_RANGE_REGEX.is_match(&upper_title) {
|
||||
needed_description_check = true;
|
||||
failed_reason = "No Size Given".to_owned();
|
||||
}
|
||||
if quantity > 1 && upper_title.contains("MIXED") {
|
||||
needed_description_check = true;
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
if upper_title.contains("CHECK THE DESCRIPTION")
|
||||
|| upper_title.contains("CHECK DESCRIPTION")
|
||||
|| upper_title.contains("SEE DESCRIPTION")
|
||||
{
|
||||
if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
|
||||
needed_description_check = true;
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
}
|
||||
if upper_title.contains("READ") {
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
|
||||
if individual_size_gb > 0 {
|
||||
total_gb = individual_size_gb * quantity;
|
||||
}
|
||||
|
||||
if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
|
||||
needed_description_check = true;
|
||||
}
|
||||
|
||||
if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
|
||||
// This condition is implicitly handled
|
||||
failed_reason = "No size given".to_owned();
|
||||
}
|
||||
|
||||
ParsedStorage {
|
||||
@@ -106,8 +105,8 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
total_gigabytes: total_gb,
|
||||
quantity,
|
||||
individual_size_gigabytes: individual_size_gb,
|
||||
needed_description_check,
|
||||
parse_engine: 0,
|
||||
failed_reason: failed_reason,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,8 +124,8 @@ mod tests {
|
||||
total_gigabytes: 512 * 3,
|
||||
quantity: 3,
|
||||
individual_size_gigabytes: 512,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -138,8 +137,8 @@ mod tests {
|
||||
total_gigabytes: 240,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 240,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -151,8 +150,8 @@ mod tests {
|
||||
total_gigabytes: 1024,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 1024,
|
||||
parse_engine: 0,
|
||||
needed_description_check: true,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
false, // Sadly this one fails :/
|
||||
),
|
||||
@@ -164,8 +163,8 @@ mod tests {
|
||||
total_gigabytes: 7 * 1024,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 7 * 1024,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -177,8 +176,8 @@ mod tests {
|
||||
total_gigabytes: 6 * 256,
|
||||
quantity: 6,
|
||||
individual_size_gigabytes: 256,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -190,8 +189,8 @@ mod tests {
|
||||
total_gigabytes: 1966,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 1966,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
160
src/parser_storage_e1.rs
Normal file
160
src/parser_storage_e1.rs
Normal file
@@ -0,0 +1,160 @@
|
||||
use crate::db::{
|
||||
ParsedLLMStorageResult, ParsedStorage, StorageLLMVersion, StorageParsingEngineVersion,
|
||||
};
|
||||
use reqwest::header::{AUTHORIZATION, CONTENT_TYPE};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use tracing::error;
|
||||
|
||||
// Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD
|
||||
// (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching, then
|
||||
// Gemini 2.5 Flash Lite at $0.10/M input and $0.40/M output, this would cost $0.0000338 Input,
|
||||
// $0.0000144 Output, and $0.0000482 Total. Given 30,000 listings this would be $1.446.
|
||||
const SYSTEM_PROMPT: &str = r#"
|
||||
You will be given a product listing for one or more storage drives. You will return *ONLY* JSON strictly adhering to the same structure and key names as below. This means no backticks or markdown/markup. You will specify how many storage drives are included in the listing as a number (1, 2, 3, etc), the size in gigabytes of each drive as a number (rounding up if needed, so 1, 2, 3, etc), and lastly if the above cannot be provided due the the listing title being incomplete or confusing, a very short reason why.
|
||||
|
||||
Here is an example for a title of "Lot of 2, Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
|
||||
{
|
||||
"quantity": 2,
|
||||
"gigabytes": 1024
|
||||
"fail_reason": ""
|
||||
}
|
||||
|
||||
And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
|
||||
{
|
||||
"quantity": 0,
|
||||
"gigabytes": 0,
|
||||
"fail_reason": "multiple mixed sizes"
|
||||
}
|
||||
"#;
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)]
|
||||
struct LLMParsedResponse {
|
||||
pub quantity: i64,
|
||||
pub gigabytes: i64,
|
||||
pub fail_reason: String,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct OpenAIResponse {
|
||||
choices: Vec<OpenAIChoice>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct OpenAIChoice {
|
||||
message: OpenAIMessage,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
struct OpenAIMessage {
|
||||
content: String,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
const OPENAI_LLM_URL: &str = "https://badurl.hak8or.com/litellm_api/chat/completions";
|
||||
#[cfg(not(test))]
|
||||
const OPENAI_LLM_URL: &str = "https://ai.hak8or.com/litellm_api/chat/completions";
|
||||
|
||||
#[cfg(test)]
|
||||
const OPENAI_LLM_API_KEY: &str = "Bearer sk-YmVlcC1ib29wLWEtcm9ib3Q";
|
||||
#[cfg(not(test))]
|
||||
const OPENAI_LLM_API_KEY: &str = "Bearer sk-HMGML94x2ag6ggOoDghSGA";
|
||||
|
||||
pub async fn parse_size_and_quantity_llm(
|
||||
item_id: i64,
|
||||
title: &str,
|
||||
) -> Option<ParsedLLMStorageResult> {
|
||||
let client = reqwest::Client::new();
|
||||
let req = client
|
||||
.post(OPENAI_LLM_URL)
|
||||
.header(CONTENT_TYPE, actix_web::mime::APPLICATION_JSON.to_string())
|
||||
.header(AUTHORIZATION, OPENAI_LLM_API_KEY)
|
||||
.body(
|
||||
json!({
|
||||
"model": "gemini-2.5-flash-lite",
|
||||
"reasoning_effort": "disable",
|
||||
"thinking": {"type": "disabled", "budget_tokens": 0},
|
||||
"messages": [
|
||||
{ "role": "system", "content": SYSTEM_PROMPT },
|
||||
{ "role": "user", "content": title }
|
||||
]
|
||||
})
|
||||
.to_string(),
|
||||
);
|
||||
let reply_body = req.send().await.ok()?.text().await.ok()?;
|
||||
let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?;
|
||||
match repl_json.choices.len() {
|
||||
0 => {
|
||||
error!("When parsing title, LLM returned ZERO choices");
|
||||
return None;
|
||||
}
|
||||
1 => { /* Nothing to do */ }
|
||||
a => error!("When parsing title, LLM returned {a}, >1 choices, using first!"),
|
||||
}
|
||||
let reply_parsed_storage_json: LLMParsedResponse =
|
||||
serde_json::from_str(&repl_json.choices[0].message.content).ok()?;
|
||||
|
||||
if !reply_parsed_storage_json.fail_reason.is_empty() {
|
||||
error!(
|
||||
"Failed parsing item_id:{item_id}, title:{title}, due to reason:{}",
|
||||
reply_parsed_storage_json.fail_reason
|
||||
);
|
||||
}
|
||||
|
||||
Some(ParsedLLMStorageResult {
|
||||
id: 0,
|
||||
fail_reason: reply_parsed_storage_json.fail_reason.clone(),
|
||||
gigabytes: reply_parsed_storage_json.gigabytes,
|
||||
item_id,
|
||||
quantity: reply_parsed_storage_json.quantity,
|
||||
title: title.to_owned(),
|
||||
llm_id: StorageLLMVersion::Gemini2d5Prompt0,
|
||||
})
|
||||
}
|
||||
|
||||
// Since we can't have a hashmap in a const, and I don't want to play with
|
||||
// making our parsed result struct contain a CoW string for fail_reason and
|
||||
// title, we are stuck with this ...
|
||||
pub fn parse_cached(item_id: i64, title: &str) -> Option<ParsedLLMStorageResult> {
|
||||
match title {
|
||||
"Lot of 2 512GB SSD 6gb/s working with 5% wear" => Some(ParsedLLMStorageResult {
|
||||
id: 0,
|
||||
item_id: item_id,
|
||||
fail_reason: "".to_string(),
|
||||
gigabytes: 512,
|
||||
quantity: 2,
|
||||
title: title.to_owned(),
|
||||
llm_id: StorageLLMVersion::Testing,
|
||||
}),
|
||||
"Lot of 2 assorted SSD" => Some(ParsedLLMStorageResult {
|
||||
id: 0,
|
||||
fail_reason: "mixed sizes".to_owned(),
|
||||
gigabytes: 0,
|
||||
item_id,
|
||||
quantity: 0,
|
||||
title: title.to_owned(),
|
||||
llm_id: StorageLLMVersion::Testing,
|
||||
}),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses size and quantity information from an item title.
|
||||
pub async fn parse_size_and_quantity(
|
||||
db: &rusqlite::Connection,
|
||||
item_id: i64,
|
||||
title: &str,
|
||||
) -> Option<ParsedStorage> {
|
||||
let plsr = parse_size_and_quantity_llm(item_id, title).await?;
|
||||
plsr.add_or_update(&db);
|
||||
|
||||
Some(ParsedStorage {
|
||||
id: 0,
|
||||
item: item_id,
|
||||
total_gigabytes: plsr.quantity * plsr.gigabytes,
|
||||
quantity: plsr.quantity,
|
||||
individual_size_gigabytes: plsr.gigabytes,
|
||||
failed_reason: plsr.fail_reason,
|
||||
parse_engine: StorageParsingEngineVersion::LLM,
|
||||
})
|
||||
}
|
# eBay search-URL building blocks. Each fragment is a query-string piece that
# gets concatenated onto URL_BASE_LISTING below.
URL_PER_PAGE_240="&_ipg=240"
URL_MIN_PRICE_USD_60="&_udlo=60.00"
URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_SORTBY_NEWLY_LISTED="&_sop=10"
URL_SORTBY_ENDING_SOONEST="&_sop=1"
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_CATEGORY_SSD="&_sacat=175669"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"

# All scraped data lives under XDG_DATA_HOME; bail out early when it is unset.
if [ -z "${XDG_DATA_HOME}" ]; then
exit
fi

# Legacy SSD fetch path (plain wget).
DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
# Seed url.json once; later runs reuse the stored URL so it can be hand-edited.
if [ ! -s "$DIR_SSDS/url.json" ]; then
URL_CATEGORY_SSD="&_sacat=175669"
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
# NOTE(review): `wget -o` writes wget's *log* to the timestamped .html file,
# not the downloaded page (that would be `-O`), and `jq '.url'` without
# --raw-output leaves the URL wrapped in double quotes. Confirm whether this
# legacy path is still in use or fully superseded by fetch() below.
wget -o "$DIR_SSDS/$(date +%s).html" "$(jq '.url' $DIR_SSDS/url.json)"
|
||||
# Heh, so Ebay started to block my scraping efforts after a while. I couldn't
# get this working with wget, so in the end I decided to go for this route which
# is quite ugly, but should work in the end. We effectively run a non headless
# version of a browser with various realistic headers and screen dimensions.
# Lastly, we give the page an extra 5 seconds to run any potential javascript
# often used to counter scraping or bots.
#
# Usage: fetch_compress_save_html <url> <output_file>
# Renders <url> in a stealth Chromium under a virtual X display, dumps the
# final DOM to stdout, and zstd-compresses it into <output_file>.
fetch_compress_save_html() {
local url="$1"
local output_file="$2"

# NOTE(review): the block below duplicates the legacy minipc wget path and
# looks like merge/diff residue inside this function — as written it runs on
# every call. Confirm whether it should be removed (fetch() below already
# covers minipc).
DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
# NOTE(review): `-o` writes the wget log to the file, not the page (`-O`).
wget -o "$DIR_MINIPC/$(date +%s).html" "$(jq '.url' $DIR_MINIPC/url.json)"
echo Fetching $url
# Run the inline Python scraper under Xvfb (headless=False below needs a real
# display), then compress the printed DOM with zstd at max practical level.
xvfb-run --server-args="-screen 0 1024x768x24" \
uv run --with playwright --with playwright-stealth - $url <<'EOF' | zstd -z --ultra -19 -o $output_file
import asyncio
import sys
from playwright.async_api import async_playwright
from playwright_stealth import Stealth

async def main():
    # Stealth patches common automation fingerprints before any page loads.
    async with Stealth().use_async(async_playwright()) as p:
        browser = await p.chromium.launch(
            executable_path='/usr/bin/chromium',
            args=[
                '--no-sandbox',
                '--disable-dev-shm-usage',
                '--disable-blink-features=AutomationControlled',
                "--window-size=1901,1018"
            ],
            headless=False
        )
        # Create context with user agent
        context = await browser.new_context(
            color_scheme=r"light",
            locale=r"en-US,en;q=0.9",
            user_agent=r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
            timezone_id=r"America/New_York",
            extra_http_headers={
                "origin": "https://www.ebay.com",
                "accept": "*/*",
                "accept-encoding": "gzip, deflate, br, zstd",
                "cache-control": "no-cache",
                "accept-language": "en-US,en;q=0.9"
            }
        )
        page = await context.new_page()

        # Give anti-bot javascript 5 extra seconds to settle before dumping.
        await page.goto(sys.argv[1], wait_until="domcontentloaded")
        await page.wait_for_timeout(5000)
        print(await page.content())
        await browser.close()

asyncio.run(main())
EOF
}
|
||||
|
||||
# fetch <name> <url_category_param>
# Ensures a cached search URL exists for <name>, then scrapes both the
# newest-listed and the ending-soonest orderings into timestamped
# .html.zst files under the data directory.
fetch() {
local name="$1"
local url_param="$2"

# NOTE(review): path prefix is `scraper/`, while the legacy code above uses
# `ebay_scraper/` — confirm the divergence is intentional.
DIR="$XDG_DATA_HOME/scraper/raw_scraped/$name"
mkdir -p "$DIR"
# Seed url.json once so the stored query can later be tweaked by hand.
if [ ! -s "$DIR/url.json" ]; then
local URL="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$url_param&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
echo "{\"url\": \"$URL\"}" > "$DIR/url.json"
fi

# Scrape twice: newly listed first, then auctions about to end.
URL_NEWEST="$(jq '.url' $DIR/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED"
fetch_compress_save_html $URL_NEWEST "$DIR/$(date +%s).html.zst"

URL_ENDING="$(jq '.url' $DIR/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST"
fetch_compress_save_html $URL_ENDING "$DIR/$(date +%s).html.zst"
}

fetch "ssd" "$URL_CATEGORY_SSD"
fetch "minipc" "$URL_CATEGORY_MINIPC_ALLINONE"
|
||||
|
||||
|
||||
# If needing to do a mass compression;
|
||||
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}
|
||||
|
||||
# If needing to purge bogus downloads
|
||||
# fd --size -100K .html.zst -x ls -lah {}
|
||||
# fd --size -100K .html.zst -x rm {}
|
||||
|
||||
# Level compression analysis;
|
||||
#
|
||||
# A single scraped result;
|
||||
# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
|
||||
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
|
||||
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
|
||||
# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
|
||||
# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
|
||||
# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
|
||||
# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
|
||||
# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
|
||||
# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
|
||||
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
|
||||
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
|
||||
# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
|
||||
# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
|
||||
# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
|
||||
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
|
||||
# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
|
||||
# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
|
||||
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
|
||||
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
|
||||
# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
|
||||
# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
|
||||
#
|
||||
# Lets see if we get benefits tar'ing and them compressing;
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012328.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012331.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015932.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015929.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019567.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019564.html
|
||||
# -rw-r--r-- 1 hak8or users 16M Sep 1 12:23 175501.tar
|
||||
# ➜ for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
|
||||
# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
|
||||
# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
|
||||
# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
|
||||
# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
|
||||
# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
|
||||
# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
|
||||
# 175501.tar : 5.36% ( 15.6 MiB => 854 KiB, 175501.tar.9)
|
||||
# 175501.tar : 5.32% ( 15.6 MiB => 847 KiB, 175501.tar.10)
|
||||
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.11)
|
||||
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.12)
|
||||
# 175501.tar : 5.48% ( 15.6 MiB => 872 KiB, 175501.tar.13)
|
||||
# 175501.tar : 5.42% ( 15.6 MiB => 864 KiB, 175501.tar.14)
|
||||
# 175501.tar : 5.19% ( 15.6 MiB => 828 KiB, 175501.tar.15)
|
||||
# 175501.tar : 5.31% ( 15.6 MiB => 845 KiB, 175501.tar.16)
|
||||
# 175501.tar : 5.01% ( 15.6 MiB => 798 KiB, 175501.tar.17)
|
||||
# 175501.tar : 5.04% ( 15.6 MiB => 803 KiB, 175501.tar.18)
|
||||
# 175501.tar : 4.84% ( 15.6 MiB => 771 KiB, 175501.tar.19)
|
||||
# 175501.tar : 4.79% ( 15.6 MiB => 764 KiB, 175501.tar.20)
|
||||
# 175501.tar : 4.74% ( 15.6 MiB => 755 KiB, 175501.tar.21)
|
||||
# 175501.tar : 4.73% ( 15.6 MiB => 753 KiB, 175501.tar.22)
|
||||
|
File diff suppressed because one or more lines are too long
BIN
test_data/scraper/raw_scraped/ssd/1750369463.html.zst
Normal file
BIN
test_data/scraper/raw_scraped/ssd/1750369463.html.zst
Normal file
Binary file not shown.
Reference in New Issue
Block a user