Compare commits
parallel_s ... main
1 commit: b538dd8012

46  Cargo.lock (generated)
@@ -680,7 +680,6 @@ dependencies = [
  "clap",
  "dirs",
  "lazy_static",
- "liblzma",
  "rayon",
  "regex",
  "rusqlite",
@@ -688,10 +687,10 @@ dependencies = [
  "serde",
  "serde_json",
  "similar-asserts",
- "strum",
  "test-log",
  "tracing",
  "tracing-subscriber",
+ "zstd",
 ]
 
 [[package]]
@@ -1163,26 +1162,6 @@ version = "0.2.174"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776"
 
-[[package]]
-name = "liblzma"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0791ab7e08ccc8e0ce893f6906eb2703ed8739d8e89b57c0714e71bad09024c8"
-dependencies = [
- "liblzma-sys",
-]
-
-[[package]]
-name = "liblzma-sys"
-version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "01b9596486f6d60c3bbe644c0e1be1aa6ccc472ad630fe8927b456973d7cb736"
-dependencies = [
- "cc",
- "libc",
- "pkg-config",
-]
-
 [[package]]
 name = "libredox"
 version = "0.1.3"
@@ -1913,29 +1892,6 @@ version = "0.11.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
-[[package]]
-name = "strum"
-version = "0.27.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f64def088c51c9510a8579e3c5d67c65349dcf755e5479ad3d010aa6454e2c32"
-dependencies = [
- "phf",
- "strum_macros",
-]
-
-[[package]]
-name = "strum_macros"
-version = "0.27.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c77a8c5abcaf0f9ce05d62342b7d298c346515365c36b673df4ebe3ced01fde8"
-dependencies = [
- "heck",
- "proc-macro2",
- "quote",
- "rustversion",
- "syn",
-]
-
 [[package]]
 name = "syn"
 version = "2.0.103"
Cargo.toml

@@ -9,17 +9,16 @@ chrono = { version = "0.4.41", features = ["serde"] }
 clap = { version = "4.5.40", features = ["derive"] }
 dirs = "6.0.0"
 lazy_static = "1.5.0"
-liblzma = "0.4.2"
 rayon = "1.10.0"
 regex = "1.11.1"
 rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
 scraper = "0.23.1"
 serde = { version = "1.0.219", features = ["derive"] }
 serde_json = "1.0.140"
-strum = { version = "0.27.1", features = ["std", "derive", "phf", "strum_macros"] }
 test-log = { version = "0.2.17", features = ["trace"] }
 tracing = { version = "0.1.41", features = ["attributes"] }
 tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
+zstd = "0.13.3"
 
 [dev-dependencies]
 similar-asserts = "1.7.0"
354  readme.md
@@ -3,352 +3,10 @@
 This is a dumb little tool which ingests raw HTML files, does some parsing on them, and serves the results over a web API.
 
 ```bash
-export URL_BASE="localhost:9876"; \
-echo run0 && http POST "$URL_BASE/category/ssd/discover" && \
-echo run1 && http POST "$URL_BASE/category/ssd/parse" && \
-echo run2 && http GET "$URL_BASE/category/ssd/parse" && \
-echo run3 && http POST "$URL_BASE/listing/parse" && \
-echo run4 && http GET "$URL_BASE/listings" since:=10099 limit:=10 cents_per_tbytes_max:=900 && \
-echo run5 && http GET "$URL_BASE/listing/267267322597" && \
-echo run6 && http GET "$URL_BASE/listing/267267322597/history" &&
-echo run7 && http GET "$URL_BASE/listing/267267322597/parsed"
-```
-
-```
-run0
-HTTP/1.1 200 OK
-content-length: 0
-content-type: text/plain; charset=utf-8
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-
-
-
-run1
-HTTP/1.1 200 OK
-content-length: 0
-content-type: text/plain; charset=utf-8
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-
-
-
-run2
-HTTP/1.1 200 OK
-content-length: 36
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-[
-[
-"PendingParse",
-1,
-1
-],
-[
-"Ready",
-0,
-1
-]
-]
-
-
-run3
-HTTP/1.1 200 OK
-content-length: 2
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-62
-
-
-run4
-HTTP/1.1 200 OK
-content-length: 4232
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-[
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1260,
-"item": 286605201240,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": true,
-"id": 5,
-"image_url": "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp",
-"item_id": 286605201240,
-"title": "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
-},
-"parsed": [
-{
-"id": 5,
-"individual_size_gigabytes": 1024,
-"item": 286605201240,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 1024
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 2400,
-"item": 177133381123,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 22,
-"image_url": "https://i.ebayimg.com/images/g/-VMAAOSwaX1oNyx4/s-l500.webp",
-"item_id": 177133381123,
-"title": "SanDisk professional G-DRIVE SSD 2TB, A+ condition"
-},
-"parsed": [
-{
-"id": 22,
-"individual_size_gigabytes": 2048,
-"item": 177133381123,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 2048
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 3108,
-"item": 187263467837,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 35,
-"image_url": "https://i.ebayimg.com/images/g/hn8AAOSw1hJoNrJm/s-l500.webp",
-"item_id": 187263467837,
-"title": "Used Fanxiang S880 4TB SSD NVME M.2 SSD PCIe 4x4 7300MBS Solid State Drive"
-},
-"parsed": [
-{
-"id": 35,
-"individual_size_gigabytes": 4096,
-"item": 187263467837,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 4096
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1000,
-"item": 267267367821,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 37,
-"image_url": "https://i.ebayimg.com/images/g/Cr8AAOSwXY1oN6m8/s-l500.webp",
-"item_id": 267267367821,
-"title": "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)"
-},
-"parsed": [
-{
-"id": 37,
-"individual_size_gigabytes": 256,
-"item": 267267367821,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 6,
-"total_gigabytes": 1536
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 4600,
-"item": 187263491149,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 44,
-"image_url": "https://i.ebayimg.com/images/g/v2EAAOSwg9poNrTr/s-l500.webp",
-"item_id": 187263491149,
-"title": "Used Silicon Power 4TB US75 Nvme PCIe Gen4x4 M.2 2280 SSD R/W Up to 7000/6500 MB"
-},
-"parsed": [
-{
-"id": 44,
-"individual_size_gigabytes": 4096,
-"item": 187263491149,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 4096
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1000,
-"item": 267267351339,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 46,
-"image_url": "https://i.ebayimg.com/images/g/z8EAAOSwyKZoN6TW/s-l500.webp",
-"item_id": 267267351339,
-"title": "(Lot of 6) Used -Micron MTFDDAV256TBN 256GB, M.2 2280 Solid State Drive"
-},
-"parsed": [
-{
-"id": 46,
-"individual_size_gigabytes": 256,
-"item": 267267351339,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 6,
-"total_gigabytes": 1536
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 99,
-"item": 306325087069,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 59,
-"image_url": "https://i.ebayimg.com/images/g/zuUAAOSwIoJoN5yC/s-l500.webp",
-"item_id": 306325087069,
-"title": "T298 ~ HP OEM Desktop Z240 Workstation Heatsink w NVMe M.2 256GB SSD 826414-001"
-},
-"parsed": [
-{
-"id": 59,
-"individual_size_gigabytes": 256,
-"item": 306325087069,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 1,
-"total_gigabytes": 256
-}
-]
-},
-{
-"history": [
-{
-"category": "ssd",
-"current_bid_usd_cents": 1000,
-"item": 267267322597,
-"timestamp": "2025-06-19T21:44:23Z"
-}
-],
-"listing": {
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 60,
-"image_url": "https://i.ebayimg.com/images/g/r8YAAOSwlkdoN5uW/s-l500.webp",
-"item_id": 267267322597,
-"title": "(Lot of 5) Used - Micro 1100 256GB SATA III 2.5\" SSD MTFDDAK256TBN"
-},
-"parsed": [
-{
-"id": 60,
-"individual_size_gigabytes": 256,
-"item": 267267322597,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 5,
-"total_gigabytes": 1280
-}
-]
-}
-]
-
-
-run5
-HTTP/1.1 200 OK
-content-length: 237
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:49 GMT
-
-{
-"buy_it_now_price_cents": null,
-"has_best_offer": false,
-"id": 60,
-"image_url": "https://i.ebayimg.com/images/g/r8YAAOSwlkdoN5uW/s-l500.webp",
-"item_id": 267267322597,
-"title": "(Lot of 5) Used - Micro 1100 256GB SATA III 2.5\" SSD MTFDDAK256TBN"
-}
-
-
-run6
-HTTP/1.1 200 OK
-content-length: 62
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:50 GMT
-
-[
-{
-"current_bid_usd_cents": 1000,
-"when": "2025-06-19T21:44:23Z"
-}
-]
-
-
-run7
-HTTP/1.1 200 OK
-content-length: 149
-content-type: application/json
-date: Thu, 10 Jul 2025 04:26:50 GMT
-
-[
-{
-"id": 60,
-"individual_size_gigabytes": 256,
-"item": 267267322597,
-"needed_description_check": false,
-"parse_engine": 0,
-"quantity": 5,
-"total_gigabytes": 1280
-}
-]
+export URL_BASE="http://scraper.homelab.hak8or.com:8080"; \
+echo run0 && http POST "$URL_BASE/page/parse/ssd" && \
+echo run1 && http POST "$URL_BASE/listing/parse" && \
+echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
+echo run3 && http GET "$URL_BASE/listing/388484391867" && \
+echo run4 && http GET "$URL_BASE/listing/286605201240/history"
 ```
128  src/db.rs
@@ -3,8 +3,6 @@ use rusqlite::Connection;
 use serde::Deserialize;
 use serde::Serialize;
 use std::path::Path;
-use strum::{EnumIter, IntoEnumIterator};
-// use strum_macros::EnumIter;
 use tracing::{error, info};
 
 pub trait DBTable {
@@ -142,54 +140,30 @@ impl SearchURL {
     }
 }
 
-#[derive(Serialize, Debug, PartialEq, Clone, EnumIter)]
-pub enum ParsedPageStatus {
-    PendingParse,
-    Ready,
-}
-impl TryFrom<i64> for ParsedPageStatus {
-    type Error = rusqlite::Error;
-
-    fn try_from(value: i64) -> Result<Self, Self::Error> {
-        match value {
-            0 => Ok(ParsedPageStatus::PendingParse),
-            1 => Ok(ParsedPageStatus::Ready),
-            _ => Err(rusqlite::Error::InvalidColumnType(
-                2,
-                "Invalid integer of {} for ParsedPageStatus".to_string(),
-                rusqlite::types::Type::Integer,
-            )),
-        }
-    }
-}
-
 #[derive(Serialize, Debug, PartialEq, Clone)]
-pub struct Page {
+pub struct ParsedPage {
     pub timestamp: DateTime<Utc>,
     pub category: String,
-    pub status: ParsedPageStatus,
 }
-impl DBTable for Page {
-    const TABLE_NAME: &'static str = "Pages";
+impl DBTable for ParsedPage {
+    const TABLE_NAME: &'static str = "Pages_Parsed";
     const TABLE_SCHEMA: &'static str = "
         id INTEGER PRIMARY KEY,
         category TEXT NOT NULL,
         timestamp INTEGER NOT NULL,
-        status INTEGER NOT NULL,
         UNIQUE(category, timestamp)
         FOREIGN KEY(category) REFERENCES SearchURLs(name)
     ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
         let mut stmt = conn.prepare(&format!(
-            "SELECT category, timestamp, status FROM {}",
+            "SELECT category, timestamp FROM {}",
             Self::TABLE_NAME
         ))?;
         let iter = stmt.query_map([], |row| {
-            Ok(Page {
+            Ok(ParsedPage {
                 category: row.get(0)?,
                 timestamp: row.get(1)?,
-                status: row.get::<_, i64>(2)?.try_into().unwrap(),
             })
         })?;
 
@@ -200,7 +174,7 @@ impl DBTable for Page {
         Ok(result)
     }
 }
-impl Page {
+impl ParsedPage {
     pub fn lookup(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
         let mut stmt = conn
             .prepare(&format!(
@@ -209,11 +183,10 @@ impl Page {
             ))
             .ok()?;
         stmt.query_one([timestamp], |row| {
-            Ok(Page {
+            Ok(ParsedPage {
                 // id: row.get(0)?,
                 category: row.get(1)?,
                 timestamp: row.get(2)?,
-                status: row.get::<_, i64>(3)?.try_into().unwrap(),
             })
         })
         .ok()
@@ -223,70 +196,13 @@ impl Page {
         let _ = conn
             .execute(
                 &format!(
-                    "INSERT OR REPLACE INTO {} (category, timestamp, status) VALUES (?1, ?2, ?3)",
+                    "INSERT OR REPLACE INTO {} (category, timestamp) VALUES (?1, ?2)",
                     Self::TABLE_NAME
                 ),
-                (&self.category, self.timestamp, self.status.clone() as i64),
+                (&self.category, self.timestamp),
            )
            .unwrap();
    }
 
-    pub fn lookup_status(
-        conn: &Connection,
-        status: ParsedPageStatus,
-        category: &str,
-        max: usize,
-    ) -> Vec<Self> {
-        let mut stmt = conn
-            .prepare(&format!(
-                "SELECT category, timestamp, status FROM {} WHERE status = {} AND category = ?1 LIMIT {}",
-                Self::TABLE_NAME,
-                status.clone() as i64,
-                max
-            ))
-            .unwrap();
-        stmt.query_map([category], |row| {
-            Ok(Self {
-                category: row.get(0)?,
-                timestamp: row.get(1)?,
-                status: row.get::<_, i64>(2)?.try_into().unwrap(),
-            })
-        })
-        .unwrap()
-        .inspect(|e| info!("debugging saw {:?}", e))
-        .filter_map(|e| e.ok())
-        .collect()
-    }
-
-    pub fn category_stats(conn: &Connection, category: &str) -> Vec<(ParsedPageStatus, i64, i64)> {
-        let mut res: Vec<(ParsedPageStatus, i64, i64)> = vec![];
-
-        for status in ParsedPageStatus::iter() {
-            let cnt_category_status = conn
-                .prepare(&format!(
-                    "SELECT COUNT(*) FROM {} WHERE category = ?1 AND status = {}",
-                    Self::TABLE_NAME,
-                    status.clone() as i64
-                ))
-                .ok()
-                .unwrap()
-                .query_one([category], |r| r.get(0))
-                .inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
-                .unwrap_or(0);
-            let cnt_category_total = conn
-                .prepare(&format!(
-                    "SELECT COUNT(*) FROM {} WHERE category = ?1",
-                    Self::TABLE_NAME
-                ))
-                .ok()
-                .unwrap()
-                .query_one([category], |r| r.get(0))
-                .inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
-                .unwrap_or(0);
-            res.push((status, cnt_category_status, cnt_category_total));
-        }
-        res
-    }
 }
 
 #[derive(Serialize, Debug, PartialEq, Copy, Clone)]
@@ -402,7 +318,7 @@ impl DBTable for ItemAppearances {
         current_bid_usd_cents INTEGER,
         UNIQUE(item, timestamp),
         FOREIGN KEY(item) REFERENCES Listings(item_id),
-        FOREIGN KEY(category, timestamp) REFERENCES Pages(category, timestamp)
+        FOREIGN KEY(category, timestamp) REFERENCES Pages_Parsed(category, timestamp)
     ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
@@ -708,7 +624,7 @@ pub fn get_initialized(path: Option<&Path>) -> Connection {
     SearchURL::initialize(&conn);
     Listing::initialize(&conn);
     ParsedStorage::initialize(&conn);
-    Page::initialize(&conn);
+    ParsedPage::initialize(&conn);
     ItemAppearances::initialize(&conn);
 
     conn
@@ -728,7 +644,7 @@ pub fn get_stats(conn: &Connection) -> Stats {
         rows_search_url: SearchURL::get_count(conn),
         rows_listing: Listing::get_count(conn),
         rows_parsed_storage: ParsedStorage::get_count(conn),
-        rows_parsed_page: Page::get_count(conn),
+        rows_parsed_page: ParsedPage::get_count(conn),
         rows_item_appearances: ItemAppearances::get_count(conn),
     }
 }
@@ -771,28 +687,12 @@ mod tests {
         parsed.add_or_update(&db);
         assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
 
-        let page = Page {
+        let page = ParsedPage {
             category: "ssd".to_owned(),
             timestamp: std::time::SystemTime::now().into(),
-            status: ParsedPageStatus::PendingParse,
         };
         page.add_or_update(&db);
-        assert_eq!(Page::lookup(&db, page.timestamp), Some(page.clone()));
-        assert_eq!(
-            Page::lookup_status(&db, ParsedPageStatus::PendingParse, "ssd", 10),
-            vec![page.clone()]
-        );
-        assert_eq!(
-            Page::lookup_status(&db, ParsedPageStatus::Ready, "ssd", 10),
-            vec![]
-        );
-        assert_eq!(
-            Page::category_stats(&db, "ssd"),
-            vec![
-                (ParsedPageStatus::PendingParse, 1, 1),
-                (ParsedPageStatus::Ready, 0, 1)
-            ]
-        );
+        assert_eq!(ParsedPage::lookup(&db, page.timestamp), Some(page.clone()));
 
         let apperance = ItemAppearances {
             item: listing.item_id,
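Net effect of the db.rs change: a scraped page row is now keyed purely by (category, timestamp), with no parse-status column to set or advance. A minimal round-trip sketch of the renamed type follows; it only uses items the diff shows main.rs importing (`ParsedPage`, `get_initialized`), and the `None` database path mirrors main.rs, so treat it as an illustration rather than code from the branch.

```rust
use ebay_scraper_rust::db::{ParsedPage, get_initialized};

fn main() {
    // Open the database the same way main.rs does (path = None).
    let conn = get_initialized(None);

    // A page is identified only by category + timestamp after this change.
    let page = ParsedPage {
        category: "ssd".to_owned(),
        timestamp: std::time::SystemTime::now().into(),
    };
    page.add_or_update(&conn);

    // Lookup mirrors the updated unit test in src/db.rs.
    assert_eq!(ParsedPage::lookup(&conn, page.timestamp), Some(page));
}
```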
src/lib.rs

@@ -2,4 +2,3 @@ pub mod db;
 pub mod parser;
 pub mod parser_ebay;
 pub mod parser_storage;
-pub mod xdg_dirs;
107  src/main.rs
@@ -1,14 +1,12 @@
-use actix_web::{App, HttpServer, Responder, Result, get, post, rt, web, web::Data};
+use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
 use chrono::{DateTime, Utc};
 use clap::Parser;
-use ebay_scraper_rust::db;
-use ebay_scraper_rust::db::DBTable;
-use ebay_scraper_rust::db::Page;
-use ebay_scraper_rust::parser;
+use ebay_scraper_rust::db::{
+    DBTable, ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
+    get_stats, listings_get_filtered,
+};
+use ebay_scraper_rust::parser::parse_dir;
 use ebay_scraper_rust::parser_storage;
-use ebay_scraper_rust::xdg_dirs;
-// use rt::mpsc;
-// use rt::time::timeout;
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::sync::Mutex;
@@ -20,6 +18,8 @@ use tracing_subscriber::fmt;
 use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt;
 use tracing_subscriber::util::SubscriberInitExt;
 
+mod xdg_dirs;
+
 #[derive(Parser, Debug)]
 #[clap(
     name = "ebay-scraper-rust",
@@ -41,7 +41,7 @@ async fn listings_filtered_get(
     filter: web::Query<ListingsFilter>,
 ) -> Result<impl Responder> {
     let start = Instant::now();
-    let res = db::listings_get_filtered(
+    let res = listings_get_filtered(
         &db.lock().unwrap(),
         &DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
         filter.limit.unwrap_or(1_000),
@@ -61,7 +61,7 @@ async fn listing_get(
     db: Data<Mutex<rusqlite::Connection>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    Ok(web::Json(db::Listing::lookup(&db.lock().unwrap(), *id)))
+    Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
 }
 
 #[get("/listing/{id}/parsed")]
@@ -69,10 +69,7 @@ async fn listing_parse_get(
     db: Data<Mutex<rusqlite::Connection>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    Ok(web::Json(db::ParsedStorage::lookup(
-        &db.lock().unwrap(),
-        *id,
-    )))
+    Ok(web::Json(ParsedStorage::lookup(&db.lock().unwrap(), *id)))
 }
 
 #[derive(Serialize)]
@@ -86,7 +83,7 @@ async fn listing_history_get(
     db: Data<Mutex<rusqlite::Connection>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    let history: Vec<_> = db::ItemAppearances::lookup(&db.lock().unwrap(), *id)
+    let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id)
         .iter()
         // .inspect(|e| info!("got: {:?}", e))
         .filter_map(|e| {
@@ -103,7 +100,7 @@ async fn listing_history_get(
 async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
     let mut cnt = 0;
     let db_unlocked = db.lock().unwrap();
-    db::Listing::lookup_non_parsed(&db_unlocked)
+    Listing::lookup_non_parsed(&db_unlocked)
         .iter()
         .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
         .inspect(|_| cnt = cnt + 1)
@@ -114,33 +111,7 @@ async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Re
 
 #[get("/category")]
 async fn category_getnames(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
-    Ok(web::Json(db::SearchURL::names(&db.lock().unwrap())))
-}
-
-async fn category_discover_worker(
-    db: Data<Mutex<rusqlite::Connection>>,
-    downloaddir: Data<PathBuf>,
-    category: web::Path<String>,
-) {
-}
-
-#[post("/category/{category}/discover")]
-#[instrument(skip_all)]
-async fn category_discover(
-    db: Data<Mutex<rusqlite::Connection>>,
-    downloaddir: Data<PathBuf>,
-    category: web::Path<String>,
-) -> Result<impl Responder> {
-    let start = Instant::now();
-    parser::add_pages(
-        &db.lock().unwrap(),
-        &downloaddir.join(category.clone()),
-        &category,
-    );
-    let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
-
-    info!("Added many pages to the category, took {elapsed} ms.");
-    Ok("")
+    Ok(web::Json(SearchURL::names(&db.lock().unwrap())))
 }
 
 #[post("/category/{category}/parse")]
@@ -149,41 +120,34 @@ async fn category_parse(
     db: Data<Mutex<rusqlite::Connection>>,
     downloaddir: Data<PathBuf>,
     category: web::Path<String>,
-) -> Result<impl Responder> {
-    parser::parse_pages(&db.lock().unwrap(), &downloaddir, &category, 100);
-    Ok("")
-}
-
-#[get("/category/{category}/parse")]
-#[instrument(skip_all)]
-async fn category_parse_get(
-    db: Data<Mutex<rusqlite::Connection>>,
-    category: web::Path<String>,
 ) -> Result<impl Responder> {
     let start = Instant::now();
-    let stats = Page::category_stats(&db.lock().unwrap(), &category);
-    stats
-        .iter()
-        .for_each(|(status, cnt, total)| info!("{:?} {} {}", status, cnt, total));
+    let count = parse_dir(
+        &downloaddir.join(category.clone()),
+        &category,
+        &db.lock().unwrap(),
+    )
+    .unwrap();
     let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
-    info!("Found, took {elapsed} ms.");
-    Ok(web::Json(stats))
+    info!("Added {count} listings, took {elapsed} ms.");
+    Ok(count.to_string())
 }
 
 #[get("/stats")]
 async fn stats_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
-    Ok(web::Json(db::get_stats(&db.lock().unwrap())))
+    Ok(web::Json(get_stats(&db.lock().unwrap())))
 }
 
 #[get("/admin")]
 async fn admin_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
     let db = db.lock().unwrap();
     let query_start_time = Instant::now();
-    let search_urls = db::SearchURL::get_all(&db).unwrap_or_default();
-    let parsed_pages = db::Page::get_all(&db).unwrap_or_default();
-    let parsed_storages = db::ParsedStorage::get_all(&db).unwrap_or_default();
-    let item_appearances = db::ItemAppearances::get_all(&db).unwrap_or_default();
-    let listings = db::Listing::get_all(&db).unwrap_or_default();
+    let search_urls = SearchURL::get_all(&db).unwrap_or_default();
+    let parsed_pages = ParsedPage::get_all(&db).unwrap_or_default();
+    let parsed_storages = ParsedStorage::get_all(&db).unwrap_or_default();
+    let item_appearances = ItemAppearances::get_all(&db).unwrap_or_default();
+    let listings = Listing::get_all(&db).unwrap_or_default();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
 
     let html_gen_start_time = Instant::now();
@@ -310,10 +274,6 @@ fn generate_table<T: Serialize>(title: &str, data: &[T]) -> String {
     )
 }
 
-async fn pages_pickup() -> std::io::Result<()> {
-    Ok(())
-}
-
 #[actix_web::main]
 async fn main() -> std::io::Result<()> {
     tracing_subscriber::registry()
@@ -327,10 +287,10 @@ async fn main() -> std::io::Result<()> {
         "Starting with scraped data dir of \"{}\".",
         scrapedatadir.to_str().unwrap()
     );
-    let db_mutex = Data::new(Mutex::new(db::get_initialized(None)));
+    let db_mutex = Data::new(Mutex::new(get_initialized(None)));
 
     // Prepare our backend via pulling in what catagories we are preconfigured with.
-    db::SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");
+    SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");
 
     HttpServer::new(move || {
         App::new()
@@ -338,13 +298,10 @@ async fn main() -> std::io::Result<()> {
             .service(listing_get)
             .service(listings_filtered_get)
             .service(listing_history_get)
-            .service(listing_parse_get)
             // Category handlers
             .service(parse_listings)
             .service(category_parse)
-            .service(category_discover)
             .service(category_getnames)
-            .service(category_parse_get)
             // Gnarly info dump
             .service(admin_get)
             .service(stats_get)
@@ -355,6 +312,4 @@ async fn main() -> std::io::Result<()> {
         .bind(("0.0.0.0", 9876))?
         .run()
         .await
-
-    // tokio::join!(server, pages_pickup)
 }
142  src/parser.rs
@@ -1,13 +1,14 @@
 use crate::{
-    db::{Page, ParsedPageStatus, SearchURL},
+    db::{ParsedPage, SearchURL},
     parser_ebay,
 };
 use rayon::prelude::*;
 use serde::Deserialize;
 use serde_json;
-use std::path::Path;
 use std::time::Instant;
+use std::{io::Read, path::Path};
 use tracing::{debug, error, info};
+use zstd;
 
 fn timestamps_from_dir(path: &Path) -> Vec<i64> {
     if !std::fs::exists(path).expect("Directory must exist") {
@@ -20,23 +21,52 @@ fn timestamps_from_dir(path: &Path) -> Vec<i64> {
     std::fs::read_dir(path)
         .unwrap()
         .map(|fpath| fpath.unwrap().path())
-        .filter_map(|fstem| {
-            fstem
-                .file_stem()
-                .and_then(|s| s.to_str())
-                .expect("Invalid file name")
-                .parse()
-                .ok()
+        .filter_map(|fname| {
+            // Turns out file_stem() doesn't handle multiple extensions and
+            // file_prefix() is still in not stable.
+            Some(fname.file_stem()?.to_str()?.split_once('.')?.0.to_owned())
         })
+        .filter_map(|fname| fname.parse().ok())
         .collect()
 }
 
-pub fn add_pages(db: &rusqlite::Connection, dir: &Path, category: &str) {
+fn read_timestamp_from_dir(
+    dir: &Path,
+    timestamp: &chrono::DateTime<chrono::Utc>,
+) -> Option<String> {
+    // First check for the normal html version, which we can just read straight.
+    let page_path = dir.join(format!("{}.{}", timestamp.timestamp(), "html"));
+    if page_path.exists() {
+        return std::fs::read_to_string(&page_path)
+            .inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
+            .ok();
+    }
+
+    // And now if it's compresed but with zstd.
+    let page_path = dir.join(format!("{}.{}.{}", timestamp.timestamp(), "html", "zst"));
+    if page_path.exists() {
+        let f = std::fs::File::open(&page_path)
+            .inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
+            .ok()?;
+        let mut s = String::new();
+        zstd::Decoder::new(f).ok()?.read_to_string(&mut s).ok()?;
+        return Some(s);
+    }
+
+    error!(
+        "Failed to lookup file for timestamp {} in {}, bailing ...",
+        timestamp,
+        dir.display()
+    );
+    None
+}
+
+pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
     // Ensure the category is created.
     let url_fpath = dir.join("url.json");
     let url_contents = std::fs::read_to_string(&url_fpath)
         .inspect_err(|e| error!("Failed reading {}: {e}", url_fpath.display()))
-        .unwrap();
+        .ok()?;
 
     #[derive(Deserialize)]
     struct URLJSON {
@@ -50,75 +80,59 @@ pub fn add_pages(db: &rusqlite::Connection, dir: &Path, category: &str) {
 
     // See all pages haven't been seen before.
     let query_start_time = Instant::now();
-    let to_parse = timestamps_from_dir(dir)
-        .into_iter()
-        .filter(|t| {
-            let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
-            let p = Page::lookup(&db, ts);
-
-            // Timestamp never seen before, lets pass it on.
-            if p.is_none() {
-                info!(
-                    "Page Timestamp:{} Catagory:{category} never seen before",
-                    ts.timestamp()
-                );
-                return true;
-            }
-
-            // Timestamp was seen before *and* from the same catagory, don't pass it on.
-            if p.unwrap().category == *category {
-                info!(
-                    "Page Timestamp:{} Catagory:{category} seen before, skipping",
-                    ts.timestamp()
-                );
-                return false;
-            }
-
+    let to_parse = timestamps_from_dir(dir).into_iter().filter(|t| {
+        let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
+        let p = ParsedPage::lookup(&db, ts);
+
+        // Timestamp never seen before, lets pass it on.
+        if p.is_none() {
             info!(
-                "Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}",
+                "Page Timestamp:{} Catagory:{category} never seen before, processing ...",
                 ts.timestamp()
             );
             return true;
-        })
-        .collect::<Vec<_>>();
+        }
+
+        // Timestamp was seen before *and* from the same catagory, don't pass
+        // it on.
+        if p.unwrap().category == *category {
+            info!(
+                "Page Timestamp:{} Catagory:{category} seen before, skipping ...",
+                ts.timestamp()
+            );
+            return false;
+        }
+
+        info!(
+            "Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}, processing ...",
+            ts.timestamp()
+        );
+        return true;
+    }).collect::<Vec<_>>();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
     info!("Time spent finding pages to parse:{total_query_time} ms");
 
-    // Say we are going to parse the pages.
-    let query_start_time = Instant::now();
-    let pages = to_parse.iter().map(|p| Page {
-        timestamp: chrono::DateTime::from_timestamp(*p, 0).unwrap(),
-        category: category.to_string(),
-        status: crate::db::ParsedPageStatus::PendingParse,
-    });
-    for p in pages {
-        p.add_or_update(&db);
-    }
-    let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
-    info!("Time spent inserting pages marked as ready to parse:{total_query_time} ms");
-}
-
-pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch: usize) {
-    let to_parse = Page::lookup_status(&db, ParsedPageStatus::PendingParse, category, batch);
+    // For each page, read the file and parse it.
     let query_start_time = Instant::now();
     let to_add = to_parse
         .par_iter()
         .map(|p| {
-            let page_path = dir
-                .join(category)
-                .join(format!("{}.html", p.timestamp.timestamp()));
-            let page_contents = std::fs::read_to_string(&page_path)
-                .inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
-                .ok()?;
+            let ts = chrono::DateTime::from_timestamp(*p, 0).unwrap();
+            let paged_info = ParsedPage {
+                timestamp: ts,
+                category: category.to_string(),
+            };
+            let page_contents = read_timestamp_from_dir(dir, &ts)?;
             let elements =
-                parser_ebay::parse_from_ebay_page(&page_contents, &p.timestamp, &category).unwrap();
+                parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
             info!(
                 "Page Timestamp:{} Catagory:{category}, found {} elements",
-                p.timestamp.timestamp(),
+                ts.timestamp(),
                 elements.len()
            );
 
-            Some((p, elements))
+            Some((paged_info, elements))
        })
        .collect::<Vec<_>>();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
@@ -126,6 +140,7 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
 
     // And lastly add it to our database!
     let query_start_time = Instant::now();
+    let mut added_count = 0;
     for iter in to_add {
         if iter.is_none() {
             continue;
@@ -134,6 +149,7 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
         paged_info.add_or_update(&db);
 
         for e in elements {
+            added_count += 1;
             e.0.add_or_update(&db);
             e.1.add_or_update(&db);
             debug!(
@@ -146,4 +162,6 @@ pub fn parse_pages(db: &rusqlite::Connection, dir: &Path, category: &str, batch:
     }
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
     info!("Time spent adding parsed pages: {total_query_time} ms");
+
+    return Some(added_count);
 }
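One detail worth spelling out from the `timestamps_from_dir` change above: once scrapes can be named `<timestamp>.html.zst`, `Path::file_stem()` only strips the final extension, which is why the closure now splits on the first `.` itself. A tiny standard-library-only illustration of that behavior (not code from the branch):

```rust
use std::path::Path;

fn main() {
    // file_stem() removes just the last extension, so a doubly-suffixed
    // scrape keeps ".html" in its "stem".
    let p = Path::new("1750369463.html.zst");
    assert_eq!(p.file_stem().unwrap(), "1750369463.html");

    // Splitting on the first '.' recovers the bare timestamp, matching what
    // the new filter_map in timestamps_from_dir() does.
    let stem = p.file_stem().unwrap().to_str().unwrap();
    let ts: i64 = stem.split_once('.').unwrap().0.parse().unwrap();
    assert_eq!(ts, 1750369463);
}
```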
src/parser_ebay.rs

@@ -172,12 +172,22 @@ pub fn parse_from_ebay_page(
 mod tests {
     use super::*;
     use similar_asserts::assert_eq;
+    use std::io::Read;
+    use zstd;
 
     #[test_log::test]
     fn parse() {
         let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
-        let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
-        let parsed = parse_from_ebay_page(html, &timestamp, "ssd").unwrap();
+        let zstd = include_bytes!("../test_data/scraper/raw_scraped/ssd/1750369463.html.zst");
+        let cursor = std::io::Cursor::new(zstd);
+
+        let mut html = String::new();
+        zstd::Decoder::new(cursor)
+            .unwrap()
+            .read_to_string(&mut html)
+            .unwrap();
+
+        let parsed = parse_from_ebay_page(&html, &timestamp, "ssd").unwrap();
         // assert_eq!(parsed.len(), 62);
 
         let parsed = parsed.first_chunk::<10>().unwrap();
(shell scrape script)

@@ -5,6 +5,8 @@ URL_PER_PAGE_240="&_ipg=240"
 URL_MIN_PRICE_USD_60="&_udlo=60.00"
 URL_SEARCHTERM_NONE="&_nkw="
 URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
+URL_SORTBY_NEWLY_LISTED="&_sop=10"
+URL_SORTBY_ENDING_SOONEST="&_sop=1"
 URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"
 
 if [ -z "${XDG_DATA_HOME}" ]; then
@@ -12,20 +14,85 @@ if [ -z "${XDG_DATA_HOME}" ]; then
     exit
 fi
 
-DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
+DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
 mkdir -p "$DIR_SSDS"
 if [ ! -s "$DIR_SSDS/url.json" ]; then
     URL_CATEGORY_SSD="&_sacat=175669"
-    URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
+    URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
     echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
 fi
-wget -o "$DIR_SSDS/$(date +%s).html" "$(jq '.url' $DIR_SSDS/url.json)"
+curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
+sleep 2
+curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
 
-DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
+DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
 mkdir -p "$DIR_MINIPC"
 if [ ! -s "$DIR_MINIPC/url.json" ]; then
     URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
-    URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
+    URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
     echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
 fi
-wget -o "$DIR_MINIPC/$(date +%s).html" "$(jq '.url' $DIR_MINIPC/url.json)"
+curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
+sleep 2
+curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
+
+# Little helper to ensure we see entries in journald
+echo Done
+
+# If needing to do a mass compression;
+# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}
+
+# Level compression analysis;
+#
+# A single scraped result;
+# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
+# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
+# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
+# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
+# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
+# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
+# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
+# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
+# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
+# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
+# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
+# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
+# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
+# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
+# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
+# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
+# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
+# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
+# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
+# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
+# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
+#
+# Lets see if we get benefits tar'ing and them compressing;
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755012328.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755012331.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755015932.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755015929.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755019567.html
+# -rw-r--r-- 1 hak8or users 2.6M Sep  1 10:11 ./1755019564.html
+# -rw-r--r-- 1 hak8or users 16M Sep  1 12:23 175501.tar
+# ➜ for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
+# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
+# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
+# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
+# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
+# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
+# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
+# 175501.tar : 5.36% ( 15.6 MiB => 854 KiB, 175501.tar.9)
+# 175501.tar : 5.32% ( 15.6 MiB => 847 KiB, 175501.tar.10)
+# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.11)
+# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.12)
+# 175501.tar : 5.48% ( 15.6 MiB => 872 KiB, 175501.tar.13)
+# 175501.tar : 5.42% ( 15.6 MiB => 864 KiB, 175501.tar.14)
+# 175501.tar : 5.19% ( 15.6 MiB => 828 KiB, 175501.tar.15)
+# 175501.tar : 5.31% ( 15.6 MiB => 845 KiB, 175501.tar.16)
+# 175501.tar : 5.01% ( 15.6 MiB => 798 KiB, 175501.tar.17)
+# 175501.tar : 5.04% ( 15.6 MiB => 803 KiB, 175501.tar.18)
+# 175501.tar : 4.84% ( 15.6 MiB => 771 KiB, 175501.tar.19)
+# 175501.tar : 4.79% ( 15.6 MiB => 764 KiB, 175501.tar.20)
+# 175501.tar : 4.74% ( 15.6 MiB => 755 KiB, 175501.tar.21)
+# 175501.tar : 4.73% ( 15.6 MiB => 753 KiB, 175501.tar.22)
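A rough read of those numbers: at level 19 the six individually compressed pages come to about 6 x 213 KiB, roughly 1.25 MiB, while the single 15.6 MiB tar of the same pages compresses to 771 KiB, around 40% smaller. At level 3 the tar buys essentially nothing (1.39 MiB vs 6 x 240 KiB, about 1.41 MiB). The jump between tar.8 and tar.9 is likely where zstd's match window grows past the size of one ~2.6 MiB page, letting boilerplate repeated across scrapes be deduplicated.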
File diff suppressed because one or more lines are too long
BIN  test_data/scraper/raw_scraped/ssd/1750369463.html.zst (new file)
Binary file not shown.