Compare commits
1 Commits
parallel_s
...
main
Author | SHA1 | Date | |
---|---|---|---|
b538dd8012 |
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -690,6 +690,7 @@ dependencies = [
|
||||
"test-log",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"zstd",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@@ -18,6 +18,7 @@ serde_json = "1.0.140"
|
||||
test-log = { version = "0.2.17", features = ["trace"] }
|
||||
tracing = { version = "0.1.41", features = ["attributes"] }
|
||||
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
|
||||
zstd = "0.13.3"
|
||||
|
||||
[dev-dependencies]
|
||||
similar-asserts = "1.7.0"
|
||||
|
@@ -5,9 +5,10 @@ use crate::{
|
||||
use rayon::prelude::*;
|
||||
use serde::Deserialize;
|
||||
use serde_json;
|
||||
use std::path::Path;
|
||||
use std::time::Instant;
|
||||
use std::{io::Read, path::Path};
|
||||
use tracing::{debug, error, info};
|
||||
use zstd;
|
||||
|
||||
fn timestamps_from_dir(path: &Path) -> Vec<i64> {
|
||||
if !std::fs::exists(path).expect("Directory must exist") {
|
||||
@@ -20,17 +21,46 @@ fn timestamps_from_dir(path: &Path) -> Vec<i64> {
|
||||
std::fs::read_dir(path)
|
||||
.unwrap()
|
||||
.map(|fpath| fpath.unwrap().path())
|
||||
.filter_map(|fstem| {
|
||||
fstem
|
||||
.file_stem()
|
||||
.and_then(|s| s.to_str())
|
||||
.expect("Invalid file name")
|
||||
.parse()
|
||||
.ok()
|
||||
.filter_map(|fname| {
|
||||
// Turns out file_stem() doesn't handle multiple extensions and
|
||||
// file_prefix() is still not stable.
|
||||
Some(fname.file_stem()?.to_str()?.split_once('.')?.0.to_owned())
|
||||
})
|
||||
.filter_map(|fname| fname.parse().ok())
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn read_timestamp_from_dir(
|
||||
dir: &Path,
|
||||
timestamp: &chrono::DateTime<chrono::Utc>,
|
||||
) -> Option<String> {
|
||||
// First check for the normal html version, which we can just read straight.
|
||||
let page_path = dir.join(format!("{}.{}", timestamp.timestamp(), "html"));
|
||||
if page_path.exists() {
|
||||
return std::fs::read_to_string(&page_path)
|
||||
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
|
||||
.ok();
|
||||
}
|
||||
|
||||
// And now if it's compresed but with zstd.
|
||||
let page_path = dir.join(format!("{}.{}.{}", timestamp.timestamp(), "html", "zst"));
|
||||
if page_path.exists() {
|
||||
let f = std::fs::File::open(&page_path)
|
||||
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
|
||||
.ok()?;
|
||||
let mut s = String::new();
|
||||
zstd::Decoder::new(f).ok()?.read_to_string(&mut s).ok()?;
|
||||
return Some(s);
|
||||
}
|
||||
|
||||
error!(
|
||||
"Failed to lookup file for timestamp {} in {}, bailing ...",
|
||||
timestamp,
|
||||
dir.display()
|
||||
);
|
||||
None
|
||||
}
|
||||
|
||||
pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
|
||||
// Ensure the category is created.
|
||||
let url_fpath = dir.join("url.json");
|
||||
@@ -93,10 +123,7 @@ pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Optio
|
||||
category: category.to_string(),
|
||||
};
|
||||
|
||||
let page_path = dir.join(format!("{}.html", ts.timestamp()));
|
||||
let page_contents = std::fs::read_to_string(&page_path)
|
||||
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
|
||||
.ok()?;
|
||||
let page_contents = read_timestamp_from_dir(dir, &ts)?;
|
||||
let elements =
|
||||
parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
|
||||
info!(
|
||||
|
@@ -172,12 +172,22 @@ pub fn parse_from_ebay_page(
|
||||
mod tests {
|
||||
use super::*;
|
||||
use similar_asserts::assert_eq;
|
||||
use std::io::Read;
|
||||
use zstd;
|
||||
|
||||
#[test_log::test]
|
||||
fn parse() {
|
||||
let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
|
||||
let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
|
||||
let parsed = parse_from_ebay_page(html, &timestamp, "ssd").unwrap();
|
||||
let zstd = include_bytes!("../test_data/scraper/raw_scraped/ssd/1750369463.html.zst");
|
||||
let cursor = std::io::Cursor::new(zstd);
|
||||
|
||||
let mut html = String::new();
|
||||
zstd::Decoder::new(cursor)
|
||||
.unwrap()
|
||||
.read_to_string(&mut html)
|
||||
.unwrap();
|
||||
|
||||
let parsed = parse_from_ebay_page(&html, &timestamp, "ssd").unwrap();
|
||||
// assert_eq!(parsed.len(), 62);
|
||||
|
||||
let parsed = parsed.first_chunk::<10>().unwrap();
|
||||
|
@@ -5,6 +5,8 @@ URL_PER_PAGE_240="&_ipg=240"
|
||||
URL_MIN_PRICE_USD_60="&_udlo=60.00"
|
||||
URL_SEARCHTERM_NONE="&_nkw="
|
||||
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
|
||||
URL_SORTBY_NEWLY_LISTED="&_sop=10"
|
||||
URL_SORTBY_ENDING_SOONEST="&_sop=1"
|
||||
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"
|
||||
|
||||
if [ -z "${XDG_DATA_HOME}" ]; then
|
||||
@@ -12,20 +14,85 @@ if [ -z "${XDG_DATA_HOME}" ]; then
|
||||
exit
|
||||
fi
|
||||
|
||||
DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
|
||||
DIR_SSDS="$XDG_DATA_HOME/scraper/raw_scraped/ssd"
|
||||
mkdir -p "$DIR_SSDS"
|
||||
if [ ! -s "$DIR_SSDS/url.json" ]; then
|
||||
URL_CATEGORY_SSD="&_sacat=175669"
|
||||
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
|
||||
URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
|
||||
echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
|
||||
fi
|
||||
wget -o "$DIR_SSDS/$(date +%s).html" "$(jq '.url' $DIR_SSDS/url.json)"
|
||||
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
|
||||
sleep 2
|
||||
curl "$(jq '.url' $DIR_SSDS/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_SSDS/$(date +%s).html.zst"
|
||||
|
||||
DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
|
||||
DIR_MINIPC="$XDG_DATA_HOME/scraper/raw_scraped/minipc"
|
||||
mkdir -p "$DIR_MINIPC"
|
||||
if [ ! -s "$DIR_MINIPC/url.json" ]; then
|
||||
URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
|
||||
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
|
||||
URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1$URL_PER_PAGE_240"
|
||||
echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
|
||||
fi
|
||||
wget -o "$DIR_MINIPC/$(date +%s).html" "$(jq '.url' $DIR_MINIPC/url.json)"
|
||||
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_NEWLY_LISTED" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
|
||||
sleep 2
|
||||
curl "$(jq '.url' $DIR_MINIPC/url.json --raw-output)$URL_SORTBY_ENDING_SOONEST" | zstd -z --ultra -19 -o "$DIR_MINIPC/$(date +%s).html.zst"
|
||||
|
||||
# Little helper to ensure we see entries in journald
|
||||
echo Done
|
||||
|
||||
# If needing to do a mass compression;
|
||||
# fd '\.html$' -x zstd -z --ultra -19 -o {}.zst {}
|
||||
|
||||
# Level compression analysis;
|
||||
#
|
||||
# A single scraped result;
|
||||
# for lvl in $(seq 3 22); zstd --compress --ultra -o 1755012328.html.zst$lvl -$lvl 1755012328.html; end
|
||||
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst3)
|
||||
# 1755012328.html : 9.04% ( 2.60 MiB => 240 KiB, 1755012328.html.zst4)
|
||||
# 1755012328.html : 8.80% ( 2.60 MiB => 234 KiB, 1755012328.html.zst5)
|
||||
# 1755012328.html : 8.58% ( 2.60 MiB => 228 KiB, 1755012328.html.zst6)
|
||||
# 1755012328.html : 8.54% ( 2.60 MiB => 227 KiB, 1755012328.html.zst7)
|
||||
# 1755012328.html : 8.45% ( 2.60 MiB => 225 KiB, 1755012328.html.zst8)
|
||||
# 1755012328.html : 8.34% ( 2.60 MiB => 222 KiB, 1755012328.html.zst9)
|
||||
# 1755012328.html : 8.30% ( 2.60 MiB => 221 KiB, 1755012328.html.zst10)
|
||||
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst11)
|
||||
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst12)
|
||||
# 1755012328.html : 8.32% ( 2.60 MiB => 221 KiB, 1755012328.html.zst13)
|
||||
# 1755012328.html : 8.29% ( 2.60 MiB => 221 KiB, 1755012328.html.zst14)
|
||||
# 1755012328.html : 8.25% ( 2.60 MiB => 219 KiB, 1755012328.html.zst15)
|
||||
# 1755012328.html : 8.28% ( 2.60 MiB => 220 KiB, 1755012328.html.zst16)
|
||||
# 1755012328.html : 8.20% ( 2.60 MiB => 218 KiB, 1755012328.html.zst17)
|
||||
# 1755012328.html : 8.23% ( 2.60 MiB => 219 KiB, 1755012328.html.zst18)
|
||||
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst19)
|
||||
# 1755012328.html : 7.99% ( 2.60 MiB => 213 KiB, 1755012328.html.zst20)
|
||||
# 1755012328.html : 7.93% ( 2.60 MiB => 211 KiB, 1755012328.html.zst21)
|
||||
# 1755012328.html : 7.91% ( 2.60 MiB => 211 KiB, 1755012328.html.zst22)
|
||||
#
|
||||
# Let's see if we get benefits from tar'ing and then compressing;
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012328.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755012331.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015932.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755015929.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019567.html
|
||||
# -rw-r--r-- 1 hak8or users 2.6M Sep 1 10:11 ./1755019564.html
|
||||
# -rw-r--r-- 1 hak8or users 16M Sep 1 12:23 175501.tar
|
||||
# ➜ for lvl in $(seq 3 22); zstd --compress --ultra -o 175501.tar.$lvl -$lvl 175501.tar; end
|
||||
# 175501.tar : 8.91% ( 15.6 MiB => 1.39 MiB, 175501.tar.3)
|
||||
# 175501.tar : 8.92% ( 15.6 MiB => 1.39 MiB, 175501.tar.4)
|
||||
# 175501.tar : 8.65% ( 15.6 MiB => 1.35 MiB, 175501.tar.5)
|
||||
# 175501.tar : 8.42% ( 15.6 MiB => 1.31 MiB, 175501.tar.6)
|
||||
# 175501.tar : 8.36% ( 15.6 MiB => 1.30 MiB, 175501.tar.7)
|
||||
# 175501.tar : 8.25% ( 15.6 MiB => 1.28 MiB, 175501.tar.8)
|
||||
# 175501.tar : 5.36% ( 15.6 MiB => 854 KiB, 175501.tar.9)
|
||||
# 175501.tar : 5.32% ( 15.6 MiB => 847 KiB, 175501.tar.10)
|
||||
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.11)
|
||||
# 175501.tar : 5.30% ( 15.6 MiB => 844 KiB, 175501.tar.12)
|
||||
# 175501.tar : 5.48% ( 15.6 MiB => 872 KiB, 175501.tar.13)
|
||||
# 175501.tar : 5.42% ( 15.6 MiB => 864 KiB, 175501.tar.14)
|
||||
# 175501.tar : 5.19% ( 15.6 MiB => 828 KiB, 175501.tar.15)
|
||||
# 175501.tar : 5.31% ( 15.6 MiB => 845 KiB, 175501.tar.16)
|
||||
# 175501.tar : 5.01% ( 15.6 MiB => 798 KiB, 175501.tar.17)
|
||||
# 175501.tar : 5.04% ( 15.6 MiB => 803 KiB, 175501.tar.18)
|
||||
# 175501.tar : 4.84% ( 15.6 MiB => 771 KiB, 175501.tar.19)
|
||||
# 175501.tar : 4.79% ( 15.6 MiB => 764 KiB, 175501.tar.20)
|
||||
# 175501.tar : 4.74% ( 15.6 MiB => 755 KiB, 175501.tar.21)
|
||||
# 175501.tar : 4.73% ( 15.6 MiB => 753 KiB, 175501.tar.22)
|
||||
|
File diff suppressed because one or more lines are too long
BIN
test_data/scraper/raw_scraped/ssd/1750369463.html.zst
Normal file
BIN
test_data/scraper/raw_scraped/ssd/1750369463.html.zst
Normal file
Binary file not shown.
Reference in New Issue
Block a user