diff --git a/Cargo.lock b/Cargo.lock index 1f90052..808455b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -39,8 +39,8 @@ dependencies = [ "flate2", "foldhash", "futures-core", - "h2", - "http", + "h2 0.3.26", + "http 0.2.12", "httparse", "httpdate", "itoa", @@ -76,7 +76,7 @@ checksum = "13d324164c51f63867b57e73ba5936ea151b8a41a1d23d1031eeb9f70d0236f8" dependencies = [ "bytestring", "cfg-if", - "http", + "http 0.2.12", "regex", "regex-lite", "serde", @@ -289,6 +289,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + [[package]] name = "autocfg" version = "1.5.0" @@ -491,6 +497,16 @@ dependencies = [ "version_check", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation-sys" version = "0.8.7" @@ -682,6 +698,7 @@ dependencies = [ "lazy_static", "rayon", "regex", + "reqwest", "rusqlite", "scraper", "serde", @@ -747,6 +764,16 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" +[[package]] +name = "errno" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + [[package]] name = "fallible-iterator" version = "0.3.0" @@ -759,6 +786,12 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7360491ce676a36bf9bb3c56c1aa791658183a54d2744120f27285738d90465a" +[[package]] +name = "fastrand" +version = "2.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + [[package]] name = "flate2" version = "1.1.2" @@ -781,6 +814,21 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.1" @@ -800,12 +848,28 @@ dependencies = [ "new_debug_unreachable", ] +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + [[package]] name = "futures-core" version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + [[package]] name = "futures-sink" version = "0.3.31" @@ -825,9 +889,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-core", + "futures-io", + "futures-sink", "futures-task", + "memchr", "pin-project-lite", "pin-utils", + "slab", ] [[package]] @@ -898,7 +966,26 @@ 
dependencies = [ "futures-core", "futures-sink", "futures-util", - "http", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "h2" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.3.1", "indexmap", "slab", "tokio", @@ -953,6 +1040,40 @@ dependencies = [ "itoa", ] +[[package]] +name = "http" +version = "1.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4a85d31aea989eead29a3aaf9e1115a180df8282431156e533de47660892565" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.3.1", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.3.1", + "http-body", + "pin-project-lite", +] + [[package]] name = "httparse" version = "1.10.1" @@ -965,6 +1086,86 @@ version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" +[[package]] +name = "hyper" +version = "1.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb3aa54a13a0dfe7fbe3a59e0c76093041720fdc77b110cc0fc260fafb4dc51e" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2 0.4.12", + "http 1.3.1", + "http-body", + "httparse", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + 
+[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.3.1", + "hyper", + "hyper-util", + "rustls", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "http 1.3.1", + "http-body", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + [[package]] name = "iana-time-zone" version = "0.1.63" @@ -1112,6 +1313,22 @@ dependencies = [ "hashbrown", ] +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc5ebe9c3a1a7a5127f920a418f7585e9e758e911d0466ed004f393b0e380b2" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -1183,6 +1400,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "cd945864f07fe9f5371a27ad7b52a172b4b499999f1d97574c9fa68373937e12" + [[package]] name = "litemap" version = "0.8.0" @@ -1295,6 +1518,23 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + [[package]] name = "new_debug_unreachable" version = "1.0.6" @@ -1347,6 +1587,50 @@ version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" +[[package]] +name = "openssl" +version = "0.10.73" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8505734d46c8ab1e19a1dce3aef597ad87dcb4c37e7188231769bd6bd51cebf8" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + +[[package]] +name = "openssl-sys" +version = "0.9.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90096e2e47630d78b7d1c20952dc621f957103f8bc2c8359ec81290d75238571" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -1646,6 +1930,62 @@ version = "0.8.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" +[[package]] +name = "reqwest" +version = "0.12.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.4.12", + "http 1.3.1", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.16", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rusqlite" version = "0.36.0" @@ -1667,6 +2007,52 @@ version = "0.1.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "989e6739f80c4ad5b13e0fd7fe89531180375b18520cc8c82080e4dc4035b84f" +[[package]] +name = "rustix" +version = "1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustls" +version = "0.23.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" +dependencies = [ + "once_cell", + "rustls-pki-types", + 
"rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "229a4a4c221013e7e1f1a043678c5cc39fe5171437c88fb47151a21e6f5b5c79" +dependencies = [ + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.21" @@ -1679,6 +2065,15 @@ version = "1.0.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" +[[package]] +name = "schannel" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f29ebaa345f945cec9fbbc532eb307f0fdad8161f281b6369539c8d84876b3d" +dependencies = [ + "windows-sys 0.59.0", +] + [[package]] name = "scopeguard" version = "1.2.0" @@ -1700,6 +2095,29 @@ dependencies = [ "tendril", ] +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49db231d56a190491cb4aeda9527f1ad45345af50b0851622a7adb8c03b01c32" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "selectors" version = "0.26.0" @@ -1892,6 +2310,12 @@ version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" +[[package]] +name = 
"subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + [[package]] name = "syn" version = "2.0.103" @@ -1903,6 +2327,15 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + [[package]] name = "synstructure" version = "0.13.2" @@ -1914,6 +2347,40 @@ dependencies = [ "syn", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b61f8f20e3a6f7e0649d825294eaf317edce30f82cf6026e7e4cb9222a7d1e" +dependencies = [ + "fastrand", + "getrandom 0.3.3", + "once_cell", + "rustix", + "windows-sys 0.59.0", +] + [[package]] name = "tendril" version = "0.4.3" @@ -2034,6 +2501,26 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8e727b36a1a0e8b74c376ac2211e40c2c8af09fb4013c60d910495810f008e9b" +dependencies = [ + "rustls", + "tokio", +] + [[package]] name = "tokio-util" version = "0.7.15" @@ -2047,6 +2534,51 @@ dependencies = [ "tokio", ] +[[package]] +name = "tower" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-http" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "adc82fd73de2a9722ac5da747f12383d2bfdb93591ee6c58486e0097890f05f2" +dependencies = [ + "bitflags", + "bytes", + "futures-util", + "http 1.3.1", + "http-body", + "iri-string", + "pin-project-lite", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + [[package]] name = "tracing" version = "0.1.41" @@ -2109,6 +2641,12 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "typenum" version = "1.18.0" @@ -2139,6 +2677,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "url" version = "2.5.4" @@ -2186,6 +2730,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -2227,6 +2780,19 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "555d470ec0bc3bb57890405e5d4322cc9ea83cebb085523ced7be4144dac1e61" +dependencies = [ + "cfg-if", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.100" @@ -2259,6 +2825,16 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "web-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "winapi" version = "0.3.9" @@ -2322,6 +2898,17 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-registry" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b8a9ed28765efc97bbc954883f4e6796c33a06546ebafacbabee9696967499e" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.3.4" @@ -2502,6 +3089,12 @@ dependencies = [ "synstructure", ] +[[package]] +name = "zeroize" 
+version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" + [[package]] name = "zerotrie" version = "0.2.2" diff --git a/Cargo.toml b/Cargo.toml index f91d056..335c736 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ dirs = "6.0.0" lazy_static = "1.5.0" rayon = "1.10.0" regex = "1.11.1" +reqwest = { version = "0.12.23", features = ["blocking"] } rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] } scraper = "0.23.1" serde = { version = "1.0.219", features = ["derive"] } diff --git a/readme.md b/readme.md index 5018f14..9af6777 100644 --- a/readme.md +++ b/readme.md @@ -10,3 +10,113 @@ echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \ echo run3 && http GET "$URL_BASE/listing/388484391867" && \ echo run4 && http GET "$URL_BASE/listing/286605201240/history" ``` + +And some jq usage for raw interaction of the data; +```bash +# Download a bunch of listings. +http https://scraper.hak8or.com/api/listings since==0 limit==20 > listings.json + +# Show what a single listing looks like. +cat listings.json | jq '.[0]' +{ + "listing": { + "id": 22563, + "item_id": 286707621236, + "title": "WD_BLACK SN770M 2TB M.2 NVMe Internal SSD (WDBDNH0020BBK-WRSN)", + "buy_it_now_price_cents": null, + "has_best_offer": false, + "image_url": "https://i.ebayimg.com/images/g/It4AAeSwzz5oddoa/s-l140.jpg" + }, + "history": [ + { + "item": 286707621236, + "timestamp": "2025-07-15T04:46:54Z", + "category": "ssd", + "current_bid_usd_cents": 12900 + } + ], + "parsed": [ + { + "id": 6, + "item": 286707621236, + "total_gigabytes": 2048, + "quantity": 1, + "individual_size_gigabytes": 2048, + "parse_engine": 0, + "needed_description_check": false + } + ] +} + +# Show the 2nd and 3rd items, but only grab a few specific entries. 
+cat listings_small.json | jq '[.[1:3][] | { + item_id: .listing.item_id, + title: .listing.title, + parsed: .parsed[] | { + total_gigabytes, + quantity, + individual_size_gigabytes + } + }]' +[ + { + "item_id": 297545995095, + "title": "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!", + "parsed": { + "total_gigabytes": 1024, + "quantity": 1, + "individual_size_gigabytes": 1024 + } + }, + { + "item_id": 127220979797, + "title": "Kingston NV2 2TB M.2 3500MG/S NVMe Internal SSD PCIe 4.0 Gen SNV2S/2000G C-/#qWT", + "parsed": { + "total_gigabytes": 2048, + "quantity": 1, + "individual_size_gigabytes": 2048 + } + } +] +``` + +And now an LLM-based parse, such that the prompt is this (189 Tokens for Gemini 2.5 Flash Lite) +``` +I will provide you with a listing title I want you to analyse. Then you will tell me the total gigabytes of all drives listed in the listing, how many drives are specified in the title, and the gigabytes of each drive in the listing. Here is an example for a title of "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!"; +``` +{ + "total_gigabytes": 1024, + "quantity": 1, + "individual_size_gigabytes": 1024 +} +``` +Reply with "OK" (and _only_ "OK") if you understand this. After you reply with that, I will provide you with a title, and then you will reply with solely the requested json (and ONLY said json). 
+``` + +And passing a title of (30 tokens); +``` +Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD +``` +returns the following json of (41 tokens); +```json +{ + "total_gigabytes": 1536, + "quantity": 3, + "individual_size_gigabytes": 512 +} +``` + +and another example of sending (49 tokens) +``` +(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1) +``` +returns the following json of (42 tokens); +```json +{ + "total_gigabytes": 1536, + "quantity": 6, + "individual_size_gigabytes": 256 +} +``` + +So for 1 listing we have a 189 Token "System Prompt", then a ~45 token title prompt, and 42 Token parsed reply. Given 30,000 listings, that's 5,670,000 Token "System Prompt" as Input, 1,350,000 Token Title prompt as Input, and 1,260,000 Token Parsed information (output). Assuming Gemini 2.5 Flash Lite, which is $0.10/M for input and $0.40/M for output, we would pay $0.702 for input and $0.504 for output, or $1.206 total. diff --git a/src/db.rs b/src/db.rs index f2ddfbe..e28f6f4 100644 --- a/src/db.rs +++ b/src/db.rs @@ -614,6 +614,101 @@ pub fn listings_get_filtered( listings } +// This is mostly meant as a way to cache all of these.
+#[derive(Serialize, Debug, PartialEq, Clone)] +pub struct ParsedLLMStorageResult { + pub id: i64, + pub item_id: i64, + pub title: String, + pub quantity: i64, + pub gigabytes: i64, + pub fail_reason: String, +} +impl DBTable for ParsedLLMStorageResult { + const TABLE_NAME: &'static str = "ParsedLLMStorageResult"; + const TABLE_SCHEMA: &'static str = " + id INTEGER PRIMARY KEY, + item_id INTEGER NOT NULL UNIQUE, + title TEXT NOT NULL, + quantity INTEGER NOT NULL, + gigabytes INTEGER NOT NULL, + fail_reason TEXT NOT NULL + "; + + fn get_all(conn: &Connection) -> rusqlite::Result> { + let mut stmt = conn.prepare(&format!( + "SELECT id, item_id, title, quantity, gigabytes, fail_reason FROM {}", + Self::TABLE_NAME + ))?; + let iter = stmt.query_map([], |row| { + Ok(ParsedLLMStorageResult { + id: row.get(0)?, + item_id: row.get(1)?, + title: row.get(2)?, + quantity: row.get(3)?, + gigabytes: row.get(4)?, + fail_reason: row.get(5)?, + }) + })?; + + let mut result = Vec::new(); + for item in iter { + result.push(item?); + } + Ok(result) + } +} +impl ParsedLLMStorageResult { + pub fn lookup(conn: &Connection, item_id: i64) -> Option { + let mut stmt = conn + .prepare(&format!( + "SELECT * FROM {} WHERE item_id = ?", + Self::TABLE_NAME + )) + .ok()?; + stmt.query_one([item_id], |row| { + Ok(ParsedLLMStorageResult { + id: row.get(0)?, + item_id: row.get(1)?, + title: row.get(2)?, + quantity: row.get(3)?, + gigabytes: row.get(4)?, + fail_reason: row.get(5)?, + }) + }) + .ok() + } + + pub fn add_or_update(&self, conn: &Connection) { + let count = conn + .execute( + &format!( + "INSERT OR REPLACE INTO {} + ( + item_id, + title, + quantity, + gigabytes, + fail_reason + ) + VALUES (?1, ?2, ?3, ?4, ?5)", + Self::TABLE_NAME + ), + ( + self.item_id, + &self.title, + self.quantity, + self.gigabytes, + self.fail_reason.clone(), + ), + ) + .unwrap(); + if count != 1 { + panic!("Expected count to be 1 but got {}", count); + } + } +} + pub fn get_initialized(path: Option<&Path>) -> 
Connection { let conn = match path { Some(p) => Connection::open(&p), @@ -626,6 +721,7 @@ pub fn get_initialized(path: Option<&Path>) -> Connection { ParsedStorage::initialize(&conn); ParsedPage::initialize(&conn); ItemAppearances::initialize(&conn); + ParsedLLMStorageResult::initialize(&conn); conn } @@ -637,6 +733,7 @@ pub struct Stats { rows_parsed_storage: i64, rows_parsed_page: i64, rows_item_appearances: i64, + // pub rows_parsed_storage_llm: i64, } pub fn get_stats(conn: &Connection) -> Stats { @@ -646,6 +743,7 @@ pub fn get_stats(conn: &Connection) -> Stats { rows_parsed_storage: ParsedStorage::get_count(conn), rows_parsed_page: ParsedPage::get_count(conn), rows_item_appearances: ItemAppearances::get_count(conn), + // rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn), } } @@ -706,6 +804,20 @@ mod tests { vec![apperance] ); + let parsedllmstorage = ParsedLLMStorageResult { + fail_reason: "Some reason".to_owned(), + gigabytes: 12, + id: 1, + item_id: 12345, + quantity: 32, + title: "Some Title".to_owned(), + }; + parsedllmstorage.add_or_update(&db); + assert_eq!( + ParsedLLMStorageResult::lookup(&db, parsedllmstorage.item_id), + Some(parsedllmstorage) + ); + assert_eq!(Listing::lookup_since(&db, page.timestamp, 3), vec![listing]); assert_eq!( Listing::lookup_since(&db, page.timestamp + chrono::Duration::seconds(1), 3), diff --git a/src/lib.rs b/src/lib.rs index 92ecfdf..5e7e677 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ pub mod db; pub mod parser; pub mod parser_ebay; -pub mod parser_storage; +pub mod parser_storage_e0; +pub mod parser_storage_e1; diff --git a/src/main.rs b/src/main.rs index a214a6a..0c588b7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,16 +2,16 @@ use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data}; use chrono::{DateTime, Utc}; use clap::Parser; use ebay_scraper_rust::db::{ - DBTable, ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized, - get_stats, 
listings_get_filtered, + DBTable, ItemAppearances, Listing, ParsedLLMStorageResult, ParsedPage, ParsedStorage, + SearchURL, get_initialized, get_stats, listings_get_filtered, }; use ebay_scraper_rust::parser::parse_dir; -use ebay_scraper_rust::parser_storage; +use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1}; use serde::{Deserialize, Serialize}; use std::path::PathBuf; use std::sync::Mutex; use std::time::Instant; -use tracing::{info, instrument}; +use tracing::{error, info, instrument}; use tracing_subscriber::filter::EnvFilter; use tracing_subscriber::fmt; @@ -28,6 +28,12 @@ mod xdg_dirs; )] struct Args {} +struct AppCtx { + db: rusqlite::Connection, + db_llm: rusqlite::Connection, + download_dir: PathBuf, +} + #[derive(Deserialize, Debug)] struct ListingsFilter { since: Option, @@ -37,12 +43,12 @@ struct ListingsFilter { #[get("/listings")] async fn listings_filtered_get( - db: Data>, + ctx: Data>, filter: web::Query, ) -> Result { let start = Instant::now(); let res = listings_get_filtered( - &db.lock().unwrap(), + &ctx.lock().unwrap().db, &DateTime::::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(), filter.limit.unwrap_or(1_000), filter.cents_per_tbytes_max.unwrap_or(100_00), @@ -57,19 +63,16 @@ async fn listings_filtered_get( } #[get("/listing/{id}")] -async fn listing_get( - db: Data>, - id: web::Path, -) -> Result { - Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id))) +async fn listing_get(ctx: Data>, id: web::Path) -> Result { + Ok(web::Json(Listing::lookup(&ctx.lock().unwrap().db, *id))) } #[get("/listing/{id}/parsed")] -async fn listing_parse_get( - db: Data>, - id: web::Path, -) -> Result { - Ok(web::Json(ParsedStorage::lookup(&db.lock().unwrap(), *id))) +async fn listing_parse_get(ctx: Data>, id: web::Path) -> Result { + Ok(web::Json(ParsedStorage::lookup( + &ctx.lock().unwrap().db, + *id, + ))) } #[derive(Serialize)] @@ -80,10 +83,10 @@ struct APIHistory { #[get("/listing/{id}/history")] async fn listing_history_get( - db: 
Data>, + ctx: Data>, id: web::Path, ) -> Result { - let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id) + let history: Vec<_> = ItemAppearances::lookup(&ctx.lock().unwrap().db, *id) .iter() // .inspect(|e| info!("got: {:?}", e)) .filter_map(|e| { @@ -97,35 +100,52 @@ async fn listing_history_get( } #[post("/listing/parse")] -async fn parse_listings(db: Data>) -> Result { - let mut cnt = 0; - let db_unlocked = db.lock().unwrap(); - Listing::lookup_non_parsed(&db_unlocked) +async fn parse_listings(ctx: Data>) -> Result { + // Lets grab a few entries and then try parsing them with two engines. + let ctx_locked = ctx.lock().unwrap(); + let entries: Vec<_> = Listing::lookup_non_parsed(&ctx_locked.db) .iter() - .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1)) - .inspect(|_| cnt = cnt + 1) - .for_each(|ps| ps.add_or_update(&db_unlocked)); + .take(10) + .map(|e| e.clone()) + .collect(); + for (item_id, title) in &entries { + let ps0 = parser_storage_e0::parse_size_and_quantity(*item_id, &title); + ps0.add_or_update(&ctx_locked.db); - Ok(web::Json(cnt)) + let ps1 = + parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await; + if ps1.is_some() { + info!( + "Parsed using an LLM title:{} and results:{:?}", + title, + ps1.unwrap() + ); + ps1.unwrap().add_or_update(&ctx_locked.db); + ps1.unwrap().add_or_update(&ctx_locked.db_llm); + } else { + error!("Failed to parse {item_id} with title {title}"); + } + } + Ok(web::Json(entries.len())) } #[get("/category")] -async fn category_getnames(db: Data>) -> Result { - Ok(web::Json(SearchURL::names(&db.lock().unwrap()))) +async fn category_getnames(ctx: Data>) -> Result { + Ok(web::Json(SearchURL::names(&ctx.lock().unwrap().db))) } #[post("/category/{category}/parse")] #[instrument(skip_all)] async fn category_parse( - db: Data>, - downloaddir: Data, + ctx: Data>, category: web::Path, ) -> Result { let start = Instant::now(); + let ctx_unlocked = ctx.lock().unwrap(); let count 
= parse_dir( - &downloaddir.join(category.clone()), + &ctx_unlocked.download_dir.join(category.clone()), &category, - &db.lock().unwrap(), + &ctx_unlocked.db, ) .unwrap(); let elapsed = start.elapsed().as_micros() as f64 / 1000.0; @@ -135,19 +155,22 @@ async fn category_parse( } #[get("/stats")] -async fn stats_get(db: Data>) -> Result { - Ok(web::Json(get_stats(&db.lock().unwrap()))) +async fn stats_get(ctx: Data>) -> Result { + let stats_db = get_stats(&ctx.lock().unwrap().db); + // let stats_db_llm = get_stats(&ctx.lock().unwrap().db_llm); + // stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm; + Ok(web::Json(stats_db)) } #[get("/admin")] -async fn admin_get(db: Data>) -> Result { - let db = db.lock().unwrap(); +async fn admin_get(ctx: Data>) -> Result { + let ctx_locked = ctx.lock().unwrap(); let query_start_time = Instant::now(); - let search_urls = SearchURL::get_all(&db).unwrap_or_default(); - let parsed_pages = ParsedPage::get_all(&db).unwrap_or_default(); - let parsed_storages = ParsedStorage::get_all(&db).unwrap_or_default(); - let item_appearances = ItemAppearances::get_all(&db).unwrap_or_default(); - let listings = Listing::get_all(&db).unwrap_or_default(); + let search_urls = SearchURL::get_all(&ctx_locked.db).unwrap_or_default(); + let parsed_pages = ParsedPage::get_all(&ctx_locked.db).unwrap_or_default(); + let parsed_storages = ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default(); + let item_appearances = ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default(); + let listings = Listing::get_all(&ctx_locked.db).unwrap_or_default(); let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0; let html_gen_start_time = Instant::now(); @@ -287,10 +310,20 @@ async fn main() -> std::io::Result<()> { "Starting with scraped data dir of \"{}\".", scrapedatadir.to_str().unwrap() ); - let db_mutex = Data::new(Mutex::new(get_initialized(None))); + + let app_data = Data::new(Mutex::new(AppCtx { + download_dir: 
scrapedatadir.clone(), + db: get_initialized(None), + db_llm: { + let db_path = scrapedatadir.with_file_name("llm.sqlite"); + let db = rusqlite::Connection::open(db_path).unwrap(); + ParsedLLMStorageResult::initialize(&db); + db + }, + })); // Prepare our backend via pulling in what catagories we are preconfigured with. - SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json"); + SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json"); HttpServer::new(move || { App::new() @@ -306,8 +339,7 @@ async fn main() -> std::io::Result<()> { .service(admin_get) .service(stats_get) // Stuff which is passed into every request. - .app_data(db_mutex.clone()) - .app_data(Data::new(scrapedatadir.clone())) + .app_data(app_data.clone()) }) .bind(("0.0.0.0", 9876))? .run() diff --git a/src/parser_storage.rs b/src/parser_storage_e0.rs similarity index 100% rename from src/parser_storage.rs rename to src/parser_storage_e0.rs diff --git a/src/parser_storage_e1.rs b/src/parser_storage_e1.rs new file mode 100644 index 0000000..f920e55 --- /dev/null +++ b/src/parser_storage_e1.rs @@ -0,0 +1,105 @@ +use crate::db::ParsedLLMStorageResult; +use crate::db::ParsedStorage; +use actix_web::mime::APPLICATION_JSON; +use reqwest::header::AUTHORIZATION; +use reqwest::header::CONTENT_TYPE; +use serde::{Deserialize, Serialize}; +use serde_json::json; + +// Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD +// (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching, then +// Gemini 2.5 Flash Lite at $0.10/M input and $0.40/M output, this would cost $0.0000338 Input, +// $0.0000144 Output, and $0.0000482 Total. Given 30,000 listings this would be $1.446. +const SYSTEM_PROMPT: &str = r#" +You will be given a product listing for one or more storage drives. You will return *ONLY* JSON strictly adhering to the same structure and key names as below. This means no backticks or markdown/markup. 
You will specify how many storage drives are included in the listing as a number (1, 2, 3, etc), the size in gigabytes of each drive as a number (rounding up if needed, so 1, 2, 3, etc), and lastly if the above cannot be provided due the the listing title being incomplete or confusing, a very short reason why. + +Here is an example for a title of "Lot of 2, Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!"; +{ + "quantity": 2, + "gigabytes": 1024 + "fail_reason": "" +} + +And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives"; +{ + "quantity": 0, + "gigabytes": 0, + "fail_reason": "multiple mixed sizes" +} +"#; + +fn create_request(title: &str) -> serde_json::Value { + json!({ + "model": "gemini-2.5-flash-lite", + "messages": [ + { + "role": "system", + "content": SYSTEM_PROMPT + }, + { + "role": "user", + "content": title + } + ] + }) +} + +#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)] +struct LLMParsedResponse { + pub quantity: i64, + pub gigabytes: i64, + pub fail_reason: String, +} + +#[derive(Deserialize, Debug)] +struct OpenAIResponse { + choices: Vec, +} + +#[derive(Deserialize, Debug)] +struct OpenAIChoice { + message: OpenAIMessage, +} + +#[derive(Deserialize, Debug)] +struct OpenAIMessage { + content: String, +} + +/// Parses size and quantity information from an item title. 
+pub async fn parse_size_and_quantity( + db: &rusqlite::Connection, + item_id: i64, + title: &str, +) -> Option { + let client = reqwest::Client::new(); + let req = client + .post("https://ai.hak8or.com/litellm_api/chat/completions") + .header(CONTENT_TYPE, APPLICATION_JSON.to_string()) + .header(AUTHORIZATION, "Bearer sk-HMGML94x2ag6ggOoDghSGA") + .body(create_request(title).to_string()); + let reply_body = req.send().await.ok()?.text().await.ok()?; + let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?; + let reply_parsed_storage_json: LLMParsedResponse = + serde_json::from_str(&repl_json.choices[0].message.content).ok()?; + + let plsr = ParsedLLMStorageResult { + id: 0, + fail_reason: reply_parsed_storage_json.fail_reason.clone(), + gigabytes: reply_parsed_storage_json.gigabytes, + item_id, + quantity: reply_parsed_storage_json.quantity, + title: title.to_owned(), + }; + plsr.add_or_update(&db); + + Some(ParsedStorage { + id: 0, + item: item_id, + total_gigabytes: reply_parsed_storage_json.quantity * reply_parsed_storage_json.gigabytes, + quantity: reply_parsed_storage_json.quantity, + individual_size_gigabytes: reply_parsed_storage_json.gigabytes, + needed_description_check: !reply_parsed_storage_json.fail_reason.is_empty(), + parse_engine: 1, + }) +}