Compare commits: b900e849d9...main (6 commits)

Commits: 461889ad2e, 448933ae67, bbca1f3bcb, e3b018b046, 91d8efe5bb, 6c6b766b72
Cargo.lock (generated) — 95 lines changed

Dependency-graph churn from the new rayon dependency: entries are added for crossbeam-deque 0.8.6, crossbeam-epoch 0.9.18, crossbeam-utils 0.8.21, either 1.15.0, rayon 1.10.0, and rayon-core 1.12.1, and the ebay_scraper_rust dependency list gains "rayon". The jiff 0.2.15, jiff-static 0.2.15, portable-atomic 1.11.1, and portable-atomic-util 0.2.4 entries are removed, along with the corresponding "jiff" line in env_logger's dependency list.
Cargo.toml

@@ -8,17 +8,16 @@ actix-web = "4.11.0"
chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.40", features = ["derive"] }
dirs = "6.0.0"
env_logger = "0.11.8"
lazy_static = "1.5.0"
log = "0.4.27"
rayon = "1.10.0"
regex = "1.11.1"
rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
scraper = "0.23.1"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
test-log = { version = "0.2.17", features = ["trace"] }
tracing = "0.1.41"
tracing-subscriber = "0.3.19"
tracing = { version = "0.1.41", features = ["attributes"] }
tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }

[dev-dependencies]
similar-asserts = "1.7.0"
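Since tracing-subscriber is now built with the env-filter feature (and main() below installs EnvFilter::from_default_env()), log verbosity is driven by the RUST_LOG environment variable. A minimal sketch of running the server with a custom filter; the directive shown is only an example:

```bash
# RUST_LOG is read by EnvFilter::from_default_env(); the module target is illustrative.
RUST_LOG="info,ebay_scraper_rust::parser=debug" cargo run --release
```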
readme.md — new file, 12 lines

@@ -0,0 +1,12 @@
# Scraper Web API

This is a dumb little tool which ingests raw HTML files, does some parsing on them, and serves the results over a web API.

```bash
export URL_BASE="http://scraper.homelab.hak8or.com:8080"; \
echo run0 && http POST "$URL_BASE/page/parse/ssd" && \
echo run1 && http POST "$URL_BASE/listing/parse" && \
echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
echo run3 && http GET "$URL_BASE/listing/388484391867" && \
echo run4 && http GET "$URL_BASE/listing/286605201240/history"
```
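Note that src/main.rs further down in this diff reshuffles these routes: page/parse/{category} becomes /category/{category}/parse, and /listing/since/{timestamp}/{limit} is replaced by a filtered GET /listings query. A sketch of the same smoke test against the new routes, assuming the same httpie client; the host, port, and filter values are placeholders, only the paths and query parameter names come from the diff:

```bash
export URL_BASE="http://scraper.homelab.hak8or.com:9876"; \
echo run0 && http POST "$URL_BASE/category/ssd/parse" && \
echo run1 && http POST "$URL_BASE/listing/parse" && \
echo run2 && http GET "$URL_BASE/listings" since==12345678 limit==2 cents_per_tbytes_max==2500 && \
echo run3 && http GET "$URL_BASE/listing/388484391867" && \
echo run4 && http GET "$URL_BASE/listing/286605201240/history"
```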
src/db.rs — 291 lines changed

@@ -1,10 +1,11 @@
use chrono::{DateTime, Utc};
use rusqlite::Connection;
use serde::Deserialize;
use serde::Serialize;
use std::path::Path;
use tracing::info;
use tracing::{error, info};

trait DBTable {
pub trait DBTable {
const TABLE_NAME: &'static str;
const TABLE_SCHEMA: &'static str;
fn initialize(conn: &Connection) {
@@ -19,6 +20,20 @@ trait DBTable {
info!("{} ({})", Self::TABLE_NAME, Self::TABLE_SCHEMA);
conn.execute(create_table, ()).unwrap();
}

fn get_count(conn: &Connection) -> i64 {
let mut stmt = conn
.prepare(&format!("SELECT COUNT(*) FROM {}", Self::TABLE_NAME))
.ok()
.unwrap();
stmt.query_one([], |r| r.get(0))
.inspect_err(|e| error!("Failed to get count due to error\"{:?}\", returning 0", e))
.unwrap_or(0)
}

fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>>
where
Self: Sized;
}

#[derive(Serialize, Debug, PartialEq, Clone)]
@@ -32,6 +47,22 @@ impl DBTable for SearchURL {
id INTEGER PRIMARY KEY,
url TEXT NOT NULL UNIQUE,
name TEXT NOT NULL UNIQUE";

fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!("SELECT url, name FROM {}", Self::TABLE_NAME))?;
let iter = stmt.query_map([], |row| {
Ok(SearchURL {
full_url: row.get(0)?,
name: row.get(1)?,
})
})?;

let mut result = Vec::new();
for item in iter {
result.push(item?);
}
Ok(result)
}
}
impl SearchURL {
pub fn lookup(conn: &Connection, name: &str) -> Option<Self> {
@@ -75,6 +106,38 @@ impl SearchURL {
.flatten()
.collect()
}

pub fn scan(conn: &Connection, downloads_dir: &Path, filename: &str) {
// Grab all directories.
let dirs = std::fs::read_dir(downloads_dir)
.unwrap()
.filter_map(|e| Some(e.ok()?.path()))
.filter(|e| e.is_dir());

#[derive(Deserialize)]
struct URLJSON {
url: String,
}

// Grab url JSON's.
for dir in dirs {
let url_fpath = dir.join(filename);
if !url_fpath.exists() {
info!("Skipping {:?} as file does not exist", url_fpath);
continue;
}

let url_contents = std::fs::read_to_string(&url_fpath)
.inspect_err(|e| error!("Failed reading {}: {e}", url_fpath.display()))
.unwrap();
let su = SearchURL {
full_url: serde_json::from_str::<URLJSON>(&url_contents).unwrap().url,
name: dir.file_name().unwrap().to_str().unwrap().to_owned(),
};
info!("Adding {:?} to search urls table", su);
su.add_or_update(&conn);
}
}
}

#[derive(Serialize, Debug, PartialEq, Clone)]
@@ -91,9 +154,28 @@ impl DBTable for ParsedPage {
UNIQUE(category, timestamp)
FOREIGN KEY(category) REFERENCES SearchURLs(name)
";

fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!(
"SELECT category, timestamp FROM {}",
Self::TABLE_NAME
))?;
let iter = stmt.query_map([], |row| {
Ok(ParsedPage {
category: row.get(0)?,
timestamp: row.get(1)?,
})
})?;

let mut result = Vec::new();
for item in iter {
result.push(item?);
}
Ok(result)
}
}
impl ParsedPage {
pub fn lookup_db(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
pub fn lookup(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
let mut stmt = conn
.prepare(&format!(
"SELECT * FROM {} WHERE timestamp = ?",
@@ -110,7 +192,7 @@ impl ParsedPage {
.ok()
}

pub fn add_or_update_db(&self, conn: &Connection) {
pub fn add_or_update(&self, conn: &Connection) {
let _ = conn
.execute(
&format!(
@@ -146,11 +228,38 @@ impl DBTable for ParsedStorage {
UNIQUE(item, parse_engine)
FOREIGN KEY(item) REFERENCES Listings(item_id)
";

fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
let iter = stmt.query_map([], |row| {
Ok(ParsedStorage {
id: row.get(0)?,
item: row.get(1)?,
total_gigabytes: row.get(2)?,
quantity: row.get(3)?,
individual_size_gigabytes: {
let r: String = row.get(4)?;
r.parse().unwrap_or(0)
},
parse_engine: row.get(5)?,
needed_description_check: row.get(6)?,
})
})?;

let mut result = Vec::new();
for item in iter {
result.push(item?);
}
Ok(result)
}
}
impl ParsedStorage {
pub fn lookup_db(conn: &Connection, item: i64) -> Vec<ParsedStorage> {
pub fn lookup(conn: &Connection, item: i64) -> Vec<ParsedStorage> {
let mut stmt = conn
.prepare(&format!("SELECT * FROM {} WHERE id = ?", Self::TABLE_NAME))
.prepare(&format!(
"SELECT * FROM {} WHERE item = ?",
Self::TABLE_NAME
))
.ok()
.unwrap();
stmt.query_map([item], |row| {
@@ -173,7 +282,7 @@ impl ParsedStorage {
.collect()
}

pub fn add_or_update_db(&self, conn: &Connection) {
pub fn add_or_update(&self, conn: &Connection) {
let _ = conn.execute(&format!("
INSERT OR REPLACE INTO {}
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
@@ -211,6 +320,27 @@ impl DBTable for ItemAppearances {
FOREIGN KEY(item) REFERENCES Listings(item_id),
FOREIGN KEY(category, timestamp) REFERENCES Pages_Parsed(category, timestamp)
";

fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!(
"SELECT item, category, timestamp, current_bid_usd_cents FROM {}",
Self::TABLE_NAME
))?;
let iter = stmt.query_map([], |row| {
Ok(ItemAppearances {
item: row.get(0)?,
category: row.get(1)?,
timestamp: row.get(2)?,
current_bid_usd_cents: row.get(3)?,
})
})?;

let mut result = Vec::new();
for item in iter {
result.push(item?);
}
Ok(result)
}
}
impl ItemAppearances {
pub fn add_or_update(&self, conn: &Connection) {
@@ -267,7 +397,7 @@ pub struct Listing {
pub id: i64,
pub item_id: i64,
pub title: String,
pub buy_it_now_price: Option<f64>,
pub buy_it_now_price_cents: Option<i64>,
pub has_best_offer: bool,
pub image_url: String,
}
@@ -281,6 +411,29 @@ impl DBTable for Listing {
has_best_offer INTEGER NOT NULL,
image_url TEXT NOT NULL
";

fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!(
"SELECT id, item_id, title, buy_it_now_usd_cents, has_best_offer, image_url FROM {}",
Self::TABLE_NAME
))?;
let iter = stmt.query_map([], |row| {
Ok(Listing {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
buy_it_now_price_cents: row.get(3)?,
has_best_offer: row.get(4)?,
image_url: row.get(5)?,
})
})?;

let mut result = Vec::new();
for item in iter {
result.push(item?);
}
Ok(result)
}
}
impl Listing {
pub fn lookup(conn: &Connection, item_id: i64) -> Option<Listing> {
@@ -295,7 +448,7 @@ impl Listing {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
buy_it_now_price: row.get(3)?,
buy_it_now_price_cents: row.get(3)?,
has_best_offer: row.get(4)?,
image_url: row.get(5)?,
})
@@ -316,20 +469,21 @@ impl Listing {
{1}.item = {0}.item_id AND
{1}.timestamp >= ?1
)
LIMIT ?2
LIMIT {2}
",
Self::TABLE_NAME,
ItemAppearances::TABLE_NAME
ItemAppearances::TABLE_NAME,
limit
))
.ok()
.unwrap();

stmt.query_map([since.timestamp(), limit], |row| {
stmt.query_map([since], |row| {
Ok(Listing {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
buy_it_now_price: row.get(3)?,
buy_it_now_price_cents: row.get(3)?,
has_best_offer: row.get(4)?,
image_url: row.get(5)?,
})
@@ -377,7 +531,7 @@ impl Listing {
(
self.item_id,
&self.title,
self.buy_it_now_price,
self.buy_it_now_price_cents,
self.has_best_offer,
self.image_url.clone(),
),
@@ -389,6 +543,77 @@ impl Listing {
}
}

#[derive(Serialize, Debug)]
pub struct ListingsFilterResult {
listing: Listing,
history: Vec<ItemAppearances>,
parsed: Vec<ParsedStorage>,
}

pub fn listings_get_filtered(
conn: &Connection,
since: &DateTime<Utc>,
limit: i64,
cents_per_tbytes_max: i64,
) -> Vec<ListingsFilterResult> {
// First grab all appearances seen since the timestamp including their
// history and parsings.
let listings_recent = Listing::lookup_since(conn, *since, 100_000)
.into_iter()
.map(|l| ListingsFilterResult {
listing: l.clone(),
history: ItemAppearances::lookup(conn, l.item_id),
parsed: ParsedStorage::lookup(conn, l.item_id),
})
.filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
.collect::<Vec<ListingsFilterResult>>();
info!(
"Found total {} listings since (str:{} epoch:{})",
listings_recent.len(),
*since,
since.timestamp()
);

// Then for each listing grab if within our price range.
let listings: Vec<ListingsFilterResult> = listings_recent
.into_iter()
.filter_map(|l| {
let mut history = l.history.clone();
history.sort_by_key(|h| h.timestamp);
// info!("item_id:{} history: {:?}", l.listing.item_id, history);
let cents = history
.last()
.map(|h| h.current_bid_usd_cents)
.unwrap_or(l.listing.buy_it_now_price_cents)?;
// info!("item_id:{} cents: {:?}", l.listing.item_id, cents);
let mut parses = l.parsed.clone();
parses.sort_by_key(|p| p.parse_engine);
// info!("item_id:{} parses: {:?}", l.listing.item_id, parses);
let gb = parses.last()?.total_gigabytes;
// info!("item_id:{} gb: {:?}", l.listing.item_id, gb);
let usd_per_tb = (cents as f64 / 100.0) / (gb as f64 / 1024.0);
// info!(
//     "item_id: {}, gb:{}, cents:{}, usd_per_tb:{}, cents_per_tbytes_max:{}",
//     l.listing.item_id, gb, cents, usd_per_tb, cents_per_tbytes_max
// );
if usd_per_tb >= (cents_per_tbytes_max as f64 / 100.0) {
None
} else {
Some(l)
}
})
.take(limit as usize)
.collect();
info!(
"Found total {} listings since (str:{} epoch:{}) filtered by price",
listings.len(),
*since,
since.timestamp()
);

listings
}

pub fn get_initialized(path: Option<&Path>) -> Connection {
let conn = match path {
Some(p) => Connection::open(&p),
@@ -405,11 +630,30 @@ pub fn get_initialized(path: Option<&Path>) -> Connection {
conn
}

#[derive(Serialize, Debug)]
pub struct Stats {
rows_search_url: i64,
rows_listing: i64,
rows_parsed_storage: i64,
rows_parsed_page: i64,
rows_item_appearances: i64,
}

pub fn get_stats(conn: &Connection) -> Stats {
Stats {
rows_search_url: SearchURL::get_count(conn),
rows_listing: Listing::get_count(conn),
rows_parsed_storage: ParsedStorage::get_count(conn),
rows_parsed_page: ParsedPage::get_count(conn),
rows_item_appearances: ItemAppearances::get_count(conn),
}
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
#[test_log::test]
fn sanity_check() {
let db = get_initialized(None);

@@ -424,7 +668,7 @@ mod tests {
id: 1,
item_id: 1234,
title: "Some Title".to_string(),
buy_it_now_price: Some(1.23),
buy_it_now_price_cents: Some(123),
has_best_offer: false,
image_url: "google.com".to_string(),
};
@@ -440,18 +684,15 @@ mod tests {
parse_engine: 9,
needed_description_check: true,
};
parsed.add_or_update_db(&db);
assert_eq!(ParsedStorage::lookup_db(&db, listing.id), vec![parsed]);
parsed.add_or_update(&db);
assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);

let page = ParsedPage {
category: "ssd".to_owned(),
timestamp: std::time::SystemTime::now().into(),
};
page.add_or_update_db(&db);
assert_eq!(
ParsedPage::lookup_db(&db, page.timestamp),
Some(page.clone())
);
page.add_or_update(&db);
assert_eq!(ParsedPage::lookup(&db, page.timestamp), Some(page.clone()));

let apperance = ItemAppearances {
item: listing.item_id,
@@ -466,5 +707,9 @@ mod tests {
);

assert_eq!(Listing::lookup_since(&db, page.timestamp, 3), vec![listing]);
assert_eq!(
Listing::lookup_since(&db, page.timestamp + chrono::Duration::seconds(1), 3),
vec![]
);
}
}
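To make the cents_per_tbytes_max cutoff in listings_get_filtered concrete (the numbers here are invented for illustration): a listing whose most recent ItemAppearances row shows a 4000-cent ($40) bid and whose newest ParsedStorage row reports total_gigabytes = 2048 works out to (4000 / 100) / (2048 / 1024) = $20 per terabyte, so it is kept when cents_per_tbytes_max is 2500 ($25/TB) and filtered out at 1500.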
src/main.rs — 362 lines changed

@@ -1,13 +1,22 @@
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use chrono::DateTime;
use chrono::{DateTime, Utc};
use clap::Parser;
use ebay_scraper_rust::db::{
ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
DBTable, ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
get_stats, listings_get_filtered,
};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use ebay_scraper_rust::parser::parse_dir;
use ebay_scraper_rust::parser_storage;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Mutex;
use tracing::info;
use std::time::Instant;
use tracing::{info, instrument};

use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
use tracing_subscriber::prelude::__tracing_subscriber_SubscriberExt;
use tracing_subscriber::util::SubscriberInitExt;

mod xdg_dirs;

@@ -19,28 +28,32 @@ mod xdg_dirs;
)]
struct Args {}

#[get("/page/{timestamp}")]
async fn page_get(
db: Data<Mutex<rusqlite::Connection>>,
timestamp: web::Path<i64>,
) -> Result<impl Responder> {
Ok(web::Json(ParsedPage::lookup_db(
&db.lock().unwrap(),
chrono::DateTime::from_timestamp(*timestamp, 0).unwrap(),
)))
#[derive(Deserialize, Debug)]
struct ListingsFilter {
since: Option<i64>,
limit: Option<i64>,
cents_per_tbytes_max: Option<i64>,
}

#[get("/listing/{id}/history")]
async fn listing_history_get(
#[get("/listings")]
async fn listings_filtered_get(
db: Data<Mutex<rusqlite::Connection>>,
id: web::Path<i64>,
filter: web::Query<ListingsFilter>,
) -> Result<impl Responder> {
let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id)
.iter()
.inspect(|e| info!("got: {:?}", e))
.filter_map(|e| Some((e.timestamp, e.current_bid_usd_cents?)))
.collect();
Ok(web::Json(history))
let start = Instant::now();
let res = listings_get_filtered(
&db.lock().unwrap(),
&DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
filter.limit.unwrap_or(1_000),
filter.cents_per_tbytes_max.unwrap_or(100_00),
);
let elapsed = start.elapsed().as_micros() as f64 / 1000.0;
info!(
"Took {elapsed} milliseconds with {} listings found for a filter of {:?}",
res.len(),
filter
);
Ok(web::Json(res))
}

#[get("/listing/{id}")]
@@ -51,19 +64,39 @@ async fn listing_get(
Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
}

#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
#[get("/listing/{id}/parsed")]
async fn listing_parse_get(
db: Data<Mutex<rusqlite::Connection>>,
req: web::Path<(i64, i64)>,
id: web::Path<i64>,
) -> Result<impl Responder> {
Ok(web::Json(Listing::lookup_since(
&db.lock().unwrap(),
DateTime::from_timestamp(req.0, 0).unwrap(),
req.1,
)))
Ok(web::Json(ParsedStorage::lookup(&db.lock().unwrap(), *id)))
}

#[post("listing/parse")]
#[derive(Serialize)]
struct APIHistory {
when: DateTime<Utc>,
current_bid_usd_cents: i64,
}

#[get("/listing/{id}/history")]
async fn listing_history_get(
db: Data<Mutex<rusqlite::Connection>>,
id: web::Path<i64>,
) -> Result<impl Responder> {
let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id)
.iter()
// .inspect(|e| info!("got: {:?}", e))
.filter_map(|e| {
Some(APIHistory {
when: e.timestamp,
current_bid_usd_cents: e.current_bid_usd_cents?,
})
})
.collect();
Ok(web::Json(history))
}

#[post("/listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
let mut cnt = 0;
let db_unlocked = db.lock().unwrap();
@@ -71,139 +104,212 @@ async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Re
.iter()
.map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
.inspect(|_| cnt = cnt + 1)
.for_each(|ps| ps.add_or_update_db(&db_unlocked));
.for_each(|ps| ps.add_or_update(&db_unlocked));

Ok(web::Json(cnt))
}

#[get("listing/parse/{id}")]
async fn listing_parse_get(
db: Data<Mutex<rusqlite::Connection>>,
id: web::Path<i64>,
) -> Result<impl Responder> {
Ok(web::Json(ParsedStorage::lookup_db(
&db.lock().unwrap(),
*id,
)))
#[get("/category")]
async fn category_getnames(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
Ok(web::Json(SearchURL::names(&db.lock().unwrap())))
}

pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
if !std::fs::exists(path).expect("Directory must exist") {
panic!(
"Directory {:?} does not exist, cannot grab timestamps from there.",
path
);
}

std::fs::read_dir(path)
.unwrap()
.inspect(|fpath| info!("Found {:?}", fpath))
.map(|fpath| fpath.unwrap().path())
.filter_map(|fstem| {
fstem
.file_stem()
.and_then(|s| s.to_str())
.expect("Invalid file name")
.parse()
.ok()
})
.collect()
}

#[post("page/parse/{category}")]
async fn parse_post(
#[post("/category/{category}/parse")]
#[instrument(skip_all)]
async fn category_parse(
db: Data<Mutex<rusqlite::Connection>>,
downloaddir: Data<PathBuf>,
category: web::Path<String>,
) -> Result<impl Responder> {
let dir = &downloaddir.join(category.clone());

// Ensure the category is created.
let url: serde_json::Value =
serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
info!("{:?}", url);
let su = SearchURL {
full_url: url.to_string(),
name: category.to_string(),
};
info!("{:?}", su);
su.add_or_update(&db.lock().unwrap());

let added: u64 = timestamps_from_dir(dir)
.iter()
.filter(|t| {
info!("Checking for the existance of page {t}");
let p = ParsedPage::lookup_db(
let start = Instant::now();
let count = parse_dir(
&downloaddir.join(category.clone()),
&category,
&db.lock().unwrap(),
chrono::DateTime::from_timestamp(**t, 0).unwrap(),
)
.unwrap();
let elapsed = start.elapsed().as_micros() as f64 / 1000.0;

info!("Added {count} listings, took {elapsed} ms.");
Ok(count.to_string())
}

#[get("/stats")]
async fn stats_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
Ok(web::Json(get_stats(&db.lock().unwrap())))
}

#[get("/admin")]
async fn admin_get(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
let db = db.lock().unwrap();
let query_start_time = Instant::now();
let search_urls = SearchURL::get_all(&db).unwrap_or_default();
let parsed_pages = ParsedPage::get_all(&db).unwrap_or_default();
let parsed_storages = ParsedStorage::get_all(&db).unwrap_or_default();
let item_appearances = ItemAppearances::get_all(&db).unwrap_or_default();
let listings = Listing::get_all(&db).unwrap_or_default();
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;

let html_gen_start_time = Instant::now();
let mut html = String::new();
html.push_str(
r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Database Dump</title>
<link rel="stylesheet" href="https://unpkg.com/purecss@2.0.6/build/pure-min.css" xintegrity="sha384-Uu6IeWbM+gzNVXJcM9XV3SohHtmWE+3VGi496jvgX1jyvDTWuaAUiIEoIeVVERG" crossorigin="anonymous">
<script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery.tablesorter/2.31.3/js/jquery.tablesorter.min.js"></script>
<style>
body { padding: 1em; }
.pure-table { margin-bottom: 2em; }
th { cursor: pointer; }
</style>
</head>
<body>
<h1>Database Dump</h1>
"#,
);

// Timestamp never seen before, lets pass it on.
if p.is_none() {
return true;
}
html.push_str(&generate_table("SearchURLs", &search_urls));
html.push_str(&generate_table("Pages_Parsed", &parsed_pages));
html.push_str(&generate_table("Storage_Parsed", &parsed_storages));
html.push_str(&generate_table("Item_Appearances", &item_appearances));
html.push_str(&generate_table("Listings", &listings));

// Timestamp was seen before *and* from the same catagory, don't pass
// it on.
if p.unwrap().category == *category {
return false;
}
return true;
})
.map(|t| {
let timestamp = chrono::DateTime::from_timestamp(*t, 0).unwrap();
info!("Adding or updating db with timestamp:{timestamp} catagory:{category}");
ParsedPage {
timestamp: timestamp,
category: category.to_string(),
}
.add_or_update_db(&db.lock().unwrap());
// Performance Metrics
let html_gen_time = html_gen_start_time.elapsed().as_micros() as f64 / 1000.0;
html.push_str(&format!(
r#"<p>
Database query time: <strong>{}ms</strong><br>
HTML generation time: <strong>{}ms</strong>
</p>"#,
total_query_time, html_gen_time
));
info!("DB Query ms: {total_query_time}, HTML Generation ms:{html_gen_time}");

let mut cnt = 0;
parser_ebay::extract_data_from_html(
&std::fs::read_to_string(dir.join(format!("{t}.html"))).unwrap(),
&timestamp,
&category,
)
.unwrap()
.iter()
.for_each(|lp| {
cnt = cnt + 1;
lp.0.add_or_update(&db.lock().unwrap());
lp.1.add_or_update(&db.lock().unwrap());
info!("Inserting id:{}, title:{}", lp.0.item_id, lp.0.title);
// Footer and Scripts
html.push_str(
r#"
<script>
$(function() {
$(".sortable-table").tablesorter();
});
cnt
})
.sum();
</script>
</body>
</html>"#,
);

Ok(added.to_string())
Ok(web::Html::new(&html))
}

fn generate_table<T: Serialize>(title: &str, data: &[T]) -> String {
use serde_json::Value;

if data.is_empty() {
return format!(
"<h2>{} (0 rows)</h2><table class='pure-table pure-table-bordered pure-table-striped sortable-table'><thead><tr></tr></thead><tbody></tbody></table>",
title
);
}

let mut headers: Vec<String> = serde_json::to_value(&data[0])
.unwrap_or(Value::Null)
.as_object()
.map_or(Vec::new(), |obj| obj.keys().cloned().collect());

// Define the desired order for specific columns.
let desired_order = ["id", "item", "item_id", "timestamp"];

// Sort the headers. Columns in `desired_order` come first,
// in that order. The rest are sorted alphabetically.
headers.sort_by(|a, b| {
let a_pos = desired_order
.iter()
.position(|&p| p == a)
.unwrap_or(usize::MAX);
let b_pos = desired_order
.iter()
.position(|&p| p == b)
.unwrap_or(usize::MAX);
a_pos.cmp(&b_pos).then_with(|| a.cmp(b))
});

// Create the HTML for the table header row.
let header_html = headers
.iter()
.map(|header| format!("<th>{}</th>", header))
.collect::<String>();

// Create the HTML for all the table body rows.
let body_html = data
.iter()
.map(|item| {
let item_json = serde_json::to_value(item).unwrap_or(Value::Null);
let obj = item_json.as_object();

// Create all cells for a single row.
let cells_html = headers
.iter()
.map(|header| {
let value = obj.and_then(|o| o.get(header)).unwrap_or(&Value::Null);
// Remove quotes from the resulting JSON string value for cleaner output.
format!("<td>{}</td>", value.to_string().replace('"', ""))
})
.collect::<String>();

format!("<tr>{}</tr>", cells_html)
})
.collect::<String>();

// Assemble the final table HTML.
format!(
"<h2>{} ({} rows)</h2><table class='pure-table pure-table-bordered pure-table-striped sortable-table'><thead><tr>{}</tr></thead><tbody>{}</tbody></table>",
title,
data.len(),
header_html,
body_html
)
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
tracing_subscriber::registry()
.with(fmt::layer())
.with(EnvFilter::from_default_env())
.init();
let _ = Args::parse();

let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("scraper", None);
info!(
"Starting with scraped data dir of \"{}\".",
scrapedatadir.to_str().unwrap()
);
let db_mutex = Data::new(Mutex::new(get_initialized(None)));

// Prepare our backend via pulling in what catagories we are preconfigured with.
SearchURL::scan(&db_mutex.lock().unwrap(), &scrapedatadir, "url.json");

HttpServer::new(move || {
App::new()
.service(page_get)
// Listing handlers
.service(listing_get)
.service(listings_filtered_get)
.service(listing_history_get)
.service(listing_since_get)
.service(parse_post)
// Category handlers
.service(parse_listings)
.service(category_parse)
.service(category_getnames)
// Gnarly info dump
.service(admin_get)
.service(stats_get)
// Stuff which is passed into every request.
.app_data(db_mutex.clone())
.app_data(Data::new(scrapedatadir.clone()))
})
.bind(("127.0.0.1", 8080))?
.bind(("0.0.0.0", 9876))?
.run()
.await
}
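Two operational notes fall out of the main() changes above: the server now binds 0.0.0.0:9876 rather than 127.0.0.1:8080, and the new /stats and /admin routes expose row counts and a full HTML dump. A quick post-deploy check might look like this sketch (the hostname is an assumption carried over from the readme):

```bash
export URL_BASE="http://scraper.homelab.hak8or.com:9876"
http GET "$URL_BASE/stats"    # JSON row counts per table, from get_stats()
http GET "$URL_BASE/admin"    # sortable HTML dump of every table
```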
src/parser.rs — 139 lines changed

@@ -1 +1,140 @@
use crate::{
db::{ParsedPage, SearchURL},
parser_ebay,
};
use rayon::prelude::*;
use serde::Deserialize;
use serde_json;
use std::path::Path;
use std::time::Instant;
use tracing::{debug, error, info};

fn timestamps_from_dir(path: &Path) -> Vec<i64> {
if !std::fs::exists(path).expect("Directory must exist") {
panic!(
"Directory {:?} does not exist, cannot grab timestamps from there.",
path
);
}

std::fs::read_dir(path)
.unwrap()
.map(|fpath| fpath.unwrap().path())
.filter_map(|fstem| {
fstem
.file_stem()
.and_then(|s| s.to_str())
.expect("Invalid file name")
.parse()
.ok()
})
.collect()
}

pub fn parse_dir(dir: &Path, category: &str, db: &rusqlite::Connection) -> Option<usize> {
// Ensure the category is created.
let url_fpath = dir.join("url.json");
let url_contents = std::fs::read_to_string(&url_fpath)
.inspect_err(|e| error!("Failed reading {}: {e}", url_fpath.display()))
.ok()?;

#[derive(Deserialize)]
struct URLJSON {
url: String,
}
let su = SearchURL {
full_url: serde_json::from_str::<URLJSON>(&url_contents).unwrap().url,
name: category.to_string(),
};
su.add_or_update(&db);

// See all pages haven't been seen before.
let query_start_time = Instant::now();
let to_parse = timestamps_from_dir(dir).into_iter().filter(|t| {
let ts = chrono::DateTime::from_timestamp(*t, 0).unwrap();
let p = ParsedPage::lookup(&db, ts);

// Timestamp never seen before, lets pass it on.
if p.is_none() {
info!(
"Page Timestamp:{} Catagory:{category} never seen before, processing ...",
ts.timestamp()
);
return true;
}

// Timestamp was seen before *and* from the same catagory, don't pass
// it on.
if p.unwrap().category == *category {
info!(
"Page Timestamp:{} Catagory:{category} seen before, skipping ...",
ts.timestamp()
);
return false;
}

info!(
"Page Timestamp:{} Catagory:{category} seen before, but not of catagory:{category}, processing ...",
ts.timestamp()
);
return true;
}).collect::<Vec<_>>();
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
info!("Time spent finding pages to parse:{total_query_time} ms");

// For each page, read the file and parse it.
let query_start_time = Instant::now();
let to_add = to_parse
.par_iter()
.map(|p| {
let ts = chrono::DateTime::from_timestamp(*p, 0).unwrap();
let paged_info = ParsedPage {
timestamp: ts,
category: category.to_string(),
};

let page_path = dir.join(format!("{}.html", ts.timestamp()));
let page_contents = std::fs::read_to_string(&page_path)
.inspect_err(|e| error!("Failed reading {}, error:{e}", page_path.display()))
.ok()?;
let elements =
parser_ebay::parse_from_ebay_page(&page_contents, &ts, &category).unwrap();
info!(
"Page Timestamp:{} Catagory:{category}, found {} elements",
ts.timestamp(),
elements.len()
);

Some((paged_info, elements))
})
.collect::<Vec<_>>();
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
info!("Time spent reading and parsing pages: {total_query_time} ms");

// And lastly add it to our database!
let query_start_time = Instant::now();
let mut added_count = 0;
for iter in to_add {
if iter.is_none() {
continue;
}
let (paged_info, elements) = iter.unwrap();
paged_info.add_or_update(&db);

for e in elements {
added_count += 1;
e.0.add_or_update(&db);
e.1.add_or_update(&db);
debug!(
"Page Timestamp:{} Catagory:{category}, inserting id:{}, title:{}",
paged_info.timestamp.timestamp(),
e.0.item_id,
e.0.title
);
}
}
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
info!("Time spent adding parsed pages: {total_query_time} ms");

return Some(added_count);
}
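parse_dir() expects each category directory to hold a url.json plus one raw HTML capture per scrape, named by its unix timestamp (that is what timestamps_from_dir() parses out of the file stems). A sketch of the assumed layout; the base path below is a guess at what xdg_dirs::ensure_scrapedata_dir_exists("scraper", None) resolves to:

```bash
ls ~/.local/share/scraper/ssd
# url.json          -> {"url": "https://www.ebay.com/sch/..."}  seeds the SearchURLs table
# 1750369463.html   -> raw search page; the file stem is the scrape's unix timestamp
# 1750369464.html
```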
src/parser_ebay.rs

@@ -18,7 +18,6 @@ fn parse_price(price_text: &str) -> Option<f64> {
if let Some(first_part) = lower_price_text.split(" to ").next() {
if let Some(caps) = PRICE_REGEX.captures(first_part) {
if let Some(price_match) = caps.get(1) {
info!("Price string:{:?} parsed!", price_match);
return price_match.as_str().replace(',', "").parse().ok();
}
}
@@ -49,7 +48,7 @@ fn parse_price(price_text: &str) -> Option<f64> {
}

/// Extracts item data from HTML content.
pub fn extract_data_from_html(
pub fn parse_from_ebay_page(
html_content: &str,
timestamp: &chrono::DateTime<Utc>,
category: &str,
@@ -98,7 +97,7 @@ pub fn extract_data_from_html(
continue;
}
if id.unwrap() == 123456 {
info!("Skipping {:?} due to bogus ID of 123456", element);
info!("Skipping element due to bogus ID of 123456");
continue;
}

@@ -154,7 +153,7 @@ pub fn extract_data_from_html(
title,
id: 0,
item_id: id?,
buy_it_now_price: final_buy_it_now_price,
buy_it_now_price_cents: final_buy_it_now_price.map(|b| (b * 100.0).round() as i64),
has_best_offer,
image_url,
},
@@ -177,8 +176,8 @@ mod tests {
#[test_log::test]
fn parse() {
let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
let parsed = extract_data_from_html(html, &timestamp, "ssd").unwrap();
let html = include_str!("../test_data/scraper/raw_scraped/ssd/1750369463.html");
let parsed = parse_from_ebay_page(html, &timestamp, "ssd").unwrap();
// assert_eq!(parsed.len(), 62);

let parsed = parsed.first_chunk::<10>().unwrap();
@@ -189,7 +188,7 @@ mod tests {
id: 0,
item_id: 388484391867,
title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
buy_it_now_price: Some(59.99),
buy_it_now_price_cents: Some(5999),
has_best_offer: true,
image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
.to_string()
@@ -210,7 +209,7 @@ mod tests {
title:
"Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
.to_string(),
buy_it_now_price: None,
buy_it_now_price_cents: None,
has_best_offer: true,
image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
.to_string()
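For reference, parse_price() above keeps only the lower bound of a price range and strips thousands separators, so a hypothetical price string like "$1,099.99 to $1,299.99" would parse to 1099.99, which parse_from_ebay_page() then stores as 109999 cents via the (b * 100.0).round() conversion.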
systemd/scraper_webapi.service — new file, 11 lines

@@ -0,0 +1,11 @@
[Unit]
Description=Run a single instance of a the web api
After=syslog.target network.target

[Service]
Type=exec
#Environment=XDG_DATA_HOME=/home/hak8or/code/ebay_scraper_rust/.tmp_run
ExecStart=/usr/local/bin/ebay_scraper_rust

[Install]
WantedBy=multi-user.target
@@ -1,59 +0,0 @@
--2025-06-22 20:08:55--  https://www.ebay.com/sch/i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’
[wget progress bars removed]
2025-06-22 20:08:56 (3.20 MB/s) - ‘i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’ saved [2412662]
@@ -1,63 +0,0 @@
--2025-06-22 20:08:17--  https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’
[wget progress bars removed]
2025-06-22 20:08:18 (3.60 MB/s) - ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’ saved [2611588]
@@ -1,63 +0,0 @@
--2025-06-22 20:08:54--  https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240.2’
[wget progress bars removed]
2025-06-22 20:08:55 (3.38 MB/s) - ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240.2’ saved [2614240]