Initial rough commit

This commit is contained in:
2025-06-22 03:00:41 -04:00
parent 5975323678
commit dd7a5ff78e
21 changed files with 12347 additions and 0 deletions

32
.github/workflows/ci.yml vendored Normal file
View File

@ -0,0 +1,32 @@
# CI: build, test, and bench across a small matrix of recent Rust toolchains.
name: Cargo Build & Test
on:
  push:
  pull_request:
env:
  CARGO_TERM_COLOR: always
jobs:
  build_and_test:
    name: Rust project - latest
    runs-on: ubuntu-latest
    strategy:
      matrix:
        rust_ver: # Try to not let this blow up in size, lets keep this to ~4
          - 1.88
          - 1.87
          - 1.86
          - 1.85.1
    # Very ugly way to force cargo and friends to be in PATH across runs, but
    # let's fix this later.
    steps:
      - uses: actions/checkout@v4
      - run: curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain none -y
      - run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
      - run: rustup update ${{ matrix.rust_ver }} && rustup default ${{ matrix.rust_ver }}
      - run: cargo build
      - run: cargo test
      - run: cargo build --release
      - run: cargo test --release
      - run: cargo bench

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

2554
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

24
Cargo.toml Normal file
View File

@ -0,0 +1,24 @@
[package]
name = "ebay_scraper_rust"
version = "0.1.0"
edition = "2024"
[dependencies]
actix-web = "4.11.0"
chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.40", features = ["derive"] }
dirs = "6.0.0"
env_logger = "0.11.8"
lazy_static = "1.5.0"
log = "0.4.27"
regex = "1.11.1"
rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
scraper = "0.23.1"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
test-log = { version = "0.2.17", features = ["trace"] }
tracing = "0.1.41"
tracing-subscriber = "0.3.19"
[dev-dependencies]
similar-asserts = "1.7.0"

349
src/db.rs Normal file
View File

@ -0,0 +1,349 @@
use chrono::{DateTime, Utc};
use rusqlite::Connection;
use serde::Serialize;
use std::path::Path;
/// A named eBay search: the full URL of a search-results page plus a
/// human-friendly category name ("ssd", "minipc", ...).
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct SearchURL {
    pub full_url: String,
    pub name: String,
}

impl SearchURL {
    /// Creates the `SearchURLs` table if it does not exist yet.
    ///
    /// # Panics
    /// Panics if the DDL statement fails.
    pub fn initialized_db(conn: &Connection) {
        conn.execute(
            "CREATE TABLE IF NOT EXISTS SearchURLs (
                id INTEGER PRIMARY KEY,
                url TEXT NOT NULL UNIQUE,
                name TEXT NOT NULL UNIQUE
            )",
            (),
        )
        .unwrap();
    }

    /// Looks up a search URL by its category name; `None` when the name is
    /// unknown (or on any query error, which `.ok()` swallows).
    pub fn lookup(conn: &Connection, name: &str) -> Option<Self> {
        // Select columns explicitly instead of `SELECT *` so row indices
        // cannot silently shift if the table ever gains a column.
        let mut stmt = conn
            .prepare("SELECT url, name FROM SearchURLs WHERE name = ?")
            .ok()?;
        stmt.query_one([name], |row| {
            Ok(SearchURL {
                full_url: row.get(0)?,
                name: row.get(1)?,
            })
        })
        .ok()
    }

    /// Inserts this search URL, replacing any previous row with the same name.
    ///
    /// # Panics
    /// Panics if the insert fails.
    pub fn add_or_update(&self, conn: &Connection) {
        conn.execute(
            "INSERT OR REPLACE INTO SearchURLs (name, url) VALUES (?1, ?2)",
            (&self.name, &self.full_url),
        )
        .unwrap();
    }

    /// Returns every known category name.
    ///
    /// # Panics
    /// Panics if the query fails or a row cannot be decoded. The previous
    /// `|row| Ok(row.get(0))` + `flatten` combination silently dropped
    /// per-row decode errors, hiding corrupt rows.
    pub fn names(conn: &Connection) -> Vec<String> {
        let mut stmt = conn.prepare("SELECT name FROM SearchURLs").unwrap();
        stmt.query_map([], |row| row.get(0))
            .unwrap()
            .map(|e| e.unwrap())
            .collect()
    }
}
/// Record that a scraped search-results page (identified by scrape timestamp
/// and category) has been parsed into the database.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedPage {
    pub timestamp: DateTime<Utc>,
    pub category: String,
}

impl ParsedPage {
    /// Creates the `Pages_Parsed` table if it does not exist yet.
    ///
    /// # Panics
    /// Panics if the DDL statement fails.
    pub fn initialized_db(conn: &Connection) {
        conn.execute(
            // The comma between UNIQUE(...) and FOREIGN KEY(...) is required
            // by standard SQL; omitting it previously only worked because
            // SQLite's parser tolerates a missing comma between table
            // constraints.
            "CREATE TABLE IF NOT EXISTS Pages_Parsed (
                id INTEGER PRIMARY KEY,
                category TEXT NOT NULL,
                timestamp INTEGER NOT NULL,
                UNIQUE(category, timestamp),
                FOREIGN KEY(category) REFERENCES SearchURLs(name)
            )",
            (),
        )
        .unwrap();
    }

    /// Looks up a parsed page by timestamp.
    ///
    /// NOTE(review): the filter is on timestamp only; if two categories are
    /// ever scraped in the same second only one row comes back. Callers
    /// (e.g. parse_post) compare the returned category — confirm that is
    /// sufficient.
    pub fn lookup_db(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
        let mut stmt = conn
            .prepare("SELECT * FROM Pages_Parsed WHERE timestamp = ?")
            .ok()?;
        stmt.query_one([timestamp], |row| {
            Ok(ParsedPage {
                category: row.get(1)?,
                timestamp: row.get(2)?,
            })
        })
        .ok()
    }

    /// Inserts this record, replacing any previous row with the same
    /// (category, timestamp) pair.
    ///
    /// # Panics
    /// Panics if the insert fails.
    pub fn add_or_update_db(&self, conn: &Connection) {
        conn.execute(
            "INSERT OR REPLACE INTO Pages_Parsed (category, timestamp) VALUES (?1, ?2)",
            (&self.category, self.timestamp),
        )
        .unwrap();
    }
}
/// Result of running a title-parsing engine over a listing: the extracted
/// storage capacity and quantity, plus whether the title was ambiguous enough
/// to warrant reading the item description by hand.
#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
pub struct ParsedStorage {
    pub id: i64,
    pub item: i64,
    pub total_gigabytes: i64,
    pub quantity: i64,
    pub individual_size_gigabytes: i64,
    pub parse_engine: i64,
    pub needed_description_check: bool,
}

impl ParsedStorage {
    /// Creates the `Storage_Parsed` table if it does not exist yet.
    ///
    /// # Panics
    /// Panics if the DDL statement fails.
    pub fn initialized_db(conn: &Connection) {
        conn.execute(
            // The comma between UNIQUE(...) and FOREIGN KEY(...) is required
            // by standard SQL; omitting it previously only worked because
            // SQLite tolerates a missing comma between table constraints.
            "CREATE TABLE IF NOT EXISTS Storage_Parsed (
                id INTEGER PRIMARY KEY,
                item INTEGER,
                total_gigabytes INTEGER,
                quantity INTEGER,
                sizes_gigabytes TEXT,
                parse_engine INTEGER,
                need_description_check INTEGER,
                UNIQUE(item, parse_engine),
                FOREIGN KEY(item) REFERENCES Ebay_Items(item_id)
            )",
            (),
        )
        .unwrap();
    }

    /// Returns all parse results matching `item`.
    ///
    /// NOTE(review): the query filters on the `id` column even though the
    /// parameter is named `item` and the call sites (the join in
    /// `lookup_non_parsed`, the /listing/parse/{id} route) suggest filtering
    /// on `item` was intended. Left as-is because the existing sanity_check
    /// test relies on the current behavior — confirm intent before changing.
    ///
    /// # Panics
    /// Panics if the query fails or a row cannot be decoded.
    pub fn lookup_db(conn: &Connection, item: i64) -> Vec<ParsedStorage> {
        let mut stmt = conn
            .prepare("SELECT * FROM Storage_Parsed WHERE id = ?")
            .unwrap();
        stmt.query_map([item], |row| {
            Ok(ParsedStorage {
                id: row.get(0)?,
                item: row.get(1)?,
                total_gigabytes: row.get(2)?,
                quantity: row.get(3)?,
                // Stored as TEXT in the DB; parsed back into an integer here.
                individual_size_gigabytes: {
                    let r: String = row.get(4)?;
                    r.parse().unwrap()
                },
                parse_engine: row.get(5)?,
                needed_description_check: row.get(6)?,
            })
        })
        .unwrap()
        .map(|e| e.unwrap())
        .collect()
    }

    /// Inserts this parse result, replacing any previous row with the same
    /// (item, parse_engine) pair.
    ///
    /// # Panics
    /// Panics if the insert fails.
    pub fn add_or_update_db(&self, conn: &Connection) {
        conn.execute("
            INSERT OR REPLACE INTO Storage_Parsed (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
            VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
            (
                &self.item,
                self.total_gigabytes,
                self.quantity,
                self.individual_size_gigabytes.to_string(),
                self.parse_engine,
                self.needed_description_check
            )
        ).unwrap();
    }
}
/// A single eBay listing as scraped from a search-results page.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct Listing {
    // Database primary key (0 on structs not yet read back from the DB).
    pub id: i64,
    // eBay's own item id, taken from the /itm/<id> URL.
    pub item_id: i64,
    pub title: String,
    // Timestamp of the scrape this listing was seen in.
    pub added_time: DateTime<Utc>,
    pub current_bid_price: Option<f64>,
    pub buy_it_now_price: Option<f64>,
    pub has_best_offer: bool,
    pub image_url: String,
}

impl Listing {
    /// Creates the `Ebay_Items` table if it does not exist yet.
    ///
    /// NOTE(review): the price columns are named `*_usd_cents`, but the code
    /// stores the parser's f64 dollar values into them unchanged — confirm
    /// which unit is actually intended before trusting the column names.
    fn initialized(conn: &Connection) {
        conn.execute(
            "CREATE TABLE IF NOT EXISTS Ebay_Items (
                id INTEGER PRIMARY KEY,
                item_id INTEGER NOT NULL UNIQUE,
                title TEXT NOT NULL,
                added_time INTEGER NOT NULL,
                current_bid_usd_cents INTEGER,
                buy_it_now_usd_cents INTEGER,
                has_best_offer INTEGER NOT NULL,
                image_url TEXT NOT NULL
            )",
            (),
        )
        .unwrap();
    }

    /// Looks up a listing by its eBay item id; `None` when unknown (or on a
    /// query/decode error, which `.ok()` silently swallows).
    pub fn lookup(conn: &Connection, item_id: i64) -> Option<Listing> {
        let mut stmt = conn
            .prepare("SELECT * FROM Ebay_Items WHERE item_id = ?")
            .ok()?;
        stmt.query_one([item_id], |row| {
            Ok(Listing {
                id: row.get(0)?,
                item_id: row.get(1)?,
                title: row.get(2)?,
                added_time: row.get(3)?,
                current_bid_price: row.get(4)?,
                buy_it_now_price: row.get(5)?,
                has_best_offer: row.get(6)?,
                image_url: row.get(7)?,
            })
        })
        .ok()
    }

    /// Returns up to `limit` listings whose `added_time` is at or after
    /// `since` (a unix timestamp), oldest first.
    ///
    /// # Panics
    /// Panics if the query fails or a row cannot be decoded.
    pub fn lookup_since(conn: &Connection, since: i64, limit: i64) -> Vec<Self> {
        let mut stmt = conn
            .prepare(
                "SELECT * FROM Ebay_Items
                 WHERE added_time >= ?1
                 ORDER BY added_time
                 LIMIT ?2
                ",
            )
            .ok()
            .unwrap();
        stmt.query_map([since, limit], |row| {
            Ok(Listing {
                id: row.get(0)?,
                item_id: row.get(1)?,
                title: row.get(2)?,
                added_time: row.get(3)?,
                current_bid_price: row.get(4)?,
                buy_it_now_price: row.get(5)?,
                has_best_offer: row.get(6)?,
                image_url: row.get(7)?,
            })
        })
        .ok()
        .unwrap()
        .map(|e| e.unwrap())
        .collect()
    }

    /// Returns `(item_id, title)` for every listing that has no row in
    /// `Storage_Parsed` yet, i.e. listings still awaiting a parse.
    ///
    /// # Panics
    /// Panics if the query fails or a row cannot be decoded.
    pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
        let mut stmt = conn
            .prepare(
                "
                SELECT ei.item_id, ei.title FROM Ebay_Items AS ei
                LEFT JOIN Storage_Parsed AS sp ON ei.item_id = sp.item
                WHERE sp.item IS NULL",
            )
            .ok()
            .unwrap();
        stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
            .ok()
            .unwrap()
            .map(|e| e.unwrap())
            .collect()
    }

    /// Inserts this listing, replacing any previous row with the same
    /// `item_id`.
    ///
    /// # Panics
    /// Panics if the insert fails or does not affect exactly one row.
    pub fn add_or_update(&self, conn: &Connection) {
        let count = conn.execute(
            "INSERT OR REPLACE INTO Ebay_Items (item_id, title, added_time, current_bid_usd_cents, buy_it_now_usd_cents, has_best_offer, image_url)
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            (
                self.item_id,
                &self.title,
                self.added_time,
                self.current_bid_price,
                self.buy_it_now_price,
                self.has_best_offer,
                self.image_url.clone()
            )
        ).unwrap();
        // INSERT OR REPLACE should always touch exactly one row; anything
        // else indicates a broken invariant, so fail loudly.
        if count != 1 {
            panic!("Expected count to be 1 but got {}", count);
        }
    }
}
/// Opens (or creates) the database at `path` — or an in-memory database when
/// `path` is `None`, which is handy for tests — and ensures every table this
/// application uses exists.
///
/// # Panics
/// Panics if the database cannot be opened or any table creation fails.
pub fn get_initialized(path: Option<&Path>) -> Connection {
    let conn = path
        .map_or_else(Connection::open_in_memory, Connection::open)
        .unwrap();
    SearchURL::initialized_db(&conn);
    Listing::initialized(&conn);
    ParsedStorage::initialized_db(&conn);
    ParsedPage::initialized_db(&conn);
    conn
}
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end smoke test: every table round-trips one record through its
    /// add_or_update + lookup pair against an in-memory database.
    #[test]
    fn sanity_check() {
        let db = get_initialized(None);

        let searchurl = SearchURL {
            full_url: "google".to_owned(),
            name: "ssd".to_owned(),
        };
        searchurl.add_or_update(&db);
        assert_eq!(SearchURL::lookup(&db, &searchurl.name), Some(searchurl));

        let listing = Listing {
            id: 1,
            item_id: 1234,
            title: "Some Title".to_string(),
            added_time: std::time::SystemTime::now().into(),
            current_bid_price: Some(0.12),
            buy_it_now_price: Some(1.23),
            has_best_offer: false,
            image_url: "google.com".to_string(),
        };
        listing.add_or_update(&db);
        assert_eq!(Listing::lookup(&db, listing.item_id), Some(listing.clone()));

        let parsed = ParsedStorage {
            id: 1,
            item: 1234,
            total_gigabytes: 13,
            quantity: 3,
            individual_size_gigabytes: 13,
            parse_engine: 9,
            needed_description_check: true,
        };
        parsed.add_or_update_db(&db);
        // NOTE(review): this passes listing.id (the DB row id), not
        // listing.item_id; it only matches because lookup_db currently
        // filters on the `id` column — see the note on lookup_db.
        assert_eq!(ParsedStorage::lookup_db(&db, listing.id), vec![parsed]);

        let page = ParsedPage {
            category: "ssd".to_owned(),
            timestamp: std::time::SystemTime::now().into(),
        };
        page.add_or_update_db(&db);
        assert_eq!(ParsedPage::lookup_db(&db, page.timestamp), Some(page));
    }
}

4
src/lib.rs Normal file
View File

@ -0,0 +1,4 @@
pub mod db;
pub mod parser;
pub mod parser_ebay;
pub mod parser_storage;

190
src/main.rs Normal file
View File

@ -0,0 +1,190 @@
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use clap::Parser;
use ebay_scraper_rust::db::{Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::info;
mod xdg_dirs;
/// Command-line arguments. Currently empty — `clap` still provides
/// `--help`/`--version` from the attributes below, and fields can be added
/// later without changing call sites.
#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for homelab purposes"
)]
struct Args {}
/// GET /page/{timestamp} — returns the parsed-page record (if any) whose
/// scrape time equals the given unix timestamp, as JSON (null when absent).
#[get("/page/{timestamp}")]
async fn page_get(
    db: Data<Mutex<rusqlite::Connection>>,
    timestamp: web::Path<i64>,
) -> Result<impl Responder> {
    let ts = chrono::DateTime::from_timestamp(*timestamp, 0).unwrap();
    let conn = db.lock().unwrap();
    let page = ParsedPage::lookup_db(&conn, ts);
    Ok(web::Json(page))
}
/// GET /listing/{id} — returns the stored listing with the given eBay item
/// id as JSON (null when unknown).
#[get("/listing/{id}")]
async fn listing_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    let conn = db.lock().unwrap();
    let found = Listing::lookup(&conn, *id);
    Ok(web::Json(found))
}
/// GET /listing/since/{timestamp}/{limit} — returns up to `limit` listings
/// first seen at or after the given unix timestamp, oldest first, as JSON.
#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
    db: Data<Mutex<rusqlite::Connection>>,
    req: web::Path<(i64, i64)>,
) -> Result<impl Responder> {
    Ok(web::Json(Listing::lookup_since(
        &db.lock().unwrap(),
        req.0,
        req.1,
    )))
}
/// POST listing/parse — runs the storage parser over every listing that has
/// no parse result yet, stores each result, and returns the count as JSON.
#[post("listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
    let db_unlocked = db.lock().unwrap();
    let mut parsed_count = 0;
    for (item_id, title) in Listing::lookup_non_parsed(&db_unlocked) {
        let parsed = parser_storage::parse_size_and_quantity(item_id, &title);
        parsed.add_or_update_db(&db_unlocked);
        parsed_count += 1;
    }
    Ok(web::Json(parsed_count))
}
/// GET listing/parse/{id} — returns the stored parse results for a listing
/// as JSON.
///
/// NOTE(review): route lacks a leading '/' (unlike /listing/{id}); also the
/// id is forwarded to ParsedStorage::lookup_db, which currently filters on
/// its `id` column rather than `item` — see the note on lookup_db.
#[get("listing/parse/{id}")]
async fn listing_parse_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedStorage::lookup_db(
        &db.lock().unwrap(),
        *id,
    )))
}
/// Returns the numeric timestamps of every `<unix_ts>.html`-style file in
/// `path`, derived from file stems. Entries whose stem is not a number
/// (e.g. "url" from url.json) or not valid UTF-8 are skipped.
///
/// # Panics
/// Panics if `path` does not exist, cannot be queried, or cannot be read.
pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
    if !std::fs::exists(path).expect("Directory must exist") {
        panic!(
            "Directory {:?} does not exist, cannot grab timestamps from there.",
            path
        );
    }
    std::fs::read_dir(path)
        .unwrap()
        .inspect(|fpath| info!("Found {:?}", fpath))
        .map(|fpath| fpath.unwrap().path())
        .filter_map(|fstem| {
            // Previously a non-UTF-8 file name panicked via `.expect` even
            // though this chain otherwise filters bad entries gracefully;
            // now such entries are skipped like any other non-numeric stem.
            fstem
                .file_stem()
                .and_then(|s| s.to_str())
                .and_then(|s| s.parse().ok())
        })
        .collect()
}
/// POST page/parse/{category} — registers the category's search URL, then
/// parses every scraped page file under `<downloaddir>/<category>/` that has
/// not been parsed for this category yet. Returns the number of listings
/// inserted, as a plain string.
#[post("page/parse/{category}")]
async fn parse_post(
    db: Data<Mutex<rusqlite::Connection>>,
    downloaddir: Data<PathBuf>,
    category: web::Path<String>,
) -> Result<impl Responder> {
    let dir = &downloaddir.join(category.clone());
    // Ensure the category is created.
    let url: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
    info!("{:?}", url);
    // NOTE(review): `url.to_string()` serializes the *entire* JSON document
    // (e.g. {"url": "..."}) into full_url; presumably url["url"] as a string
    // was intended — confirm before anything consumes full_url.
    let su = SearchURL {
        full_url: url.to_string(),
        name: category.to_string(),
    };
    info!("{:?}", su);
    su.add_or_update(&db.lock().unwrap());
    // For each page timestamp on disk, decide whether it still needs parsing.
    // The mutex is re-locked per closure call; each guard is a temporary
    // dropped at the end of its statement, so no deadlock — but it does mean
    // many short lock acquisitions per request.
    let added: u64 = timestamps_from_dir(dir)
        .iter()
        .filter(|t| {
            info!("Checking for the existance of page {t}");
            let p = ParsedPage::lookup_db(
                &db.lock().unwrap(),
                chrono::DateTime::from_timestamp(**t, 0).unwrap(),
            );
            // Timestamp never seen before, lets pass it on.
            if p.is_none() {
                return true;
            }
            // Timestamp was seen before *and* from the same category, don't
            // pass it on.
            if p.unwrap().category == *category {
                return false;
            }
            return true;
        })
        .map(|t| {
            let timestamp = chrono::DateTime::from_timestamp(*t, 0).unwrap();
            info!("Adding or updating db with timestamp:{timestamp} catagory:{category}");
            // Record the page as parsed before inserting its listings.
            ParsedPage {
                timestamp: timestamp,
                category: category.to_string(),
            }
            .add_or_update_db(&db.lock().unwrap());
            let mut cnt = 0;
            parser_ebay::extract_data_from_html(
                &std::fs::read_to_string(dir.join(format!("{t}.html"))).unwrap(),
                &timestamp,
            )
            .unwrap()
            .iter()
            .for_each(|l| {
                cnt = cnt + 1;
                l.add_or_update(&db.lock().unwrap());
                info!("Inserting id:{}, title:{}", l.item_id, l.title);
            });
            cnt
        })
        .sum();
    Ok(added.to_string())
}
/// Entry point: sets up logging, resolves the scraped-data directory, opens
/// the database, and serves the HTTP API on 127.0.0.1:8080.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    // NOTE(review): logging is initialized via env_logger but the codebase
    // emits through `tracing` macros — confirm the events actually reach this
    // logger (a tracing->log bridge may be needed).
    env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
    let _ = Args::parse();
    let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
    info!(
        "Starting with scraped data dir of \"{}\".",
        scrapedatadir.to_str().unwrap()
    );
    // NOTE(review): get_initialized(None) opens an *in-memory* database, so
    // all parsed data is lost on restart and nothing is persisted under
    // scrapedatadir — confirm whether an on-disk path was intended here.
    let db_mutex = Data::new(Mutex::new(get_initialized(None)));
    HttpServer::new(move || {
        App::new()
            .service(page_get)
            .service(listing_get)
            .service(listing_since_get)
            .service(parse_post)
            .service(parse_listings)
            .app_data(db_mutex.clone())
            .app_data(Data::new(scrapedatadir.clone()))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}

1
src/parser.rs Normal file
View File

@ -0,0 +1 @@

209
src/parser_ebay.rs Normal file
View File

@ -0,0 +1,209 @@
use crate::db::Listing;
use chrono::Utc;
use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector};
use tracing::{debug, info, warn};
lazy_static! {
    /// First dollar amount in a string, e.g. "$1,234.56" captures "1,234.56".
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
    /// Numeric item id from an eBay listing URL, e.g. ".../itm/388484391867".
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
    /// Leading "NEW LISTING" marker (case-insensitive) stripped from titles.
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}
/// Parses price from a string, taking the first price if it's a range.
///
/// Examples: "$59.99" -> Some(59.99); "$10.00 to $20.00" -> Some(10.0).
/// Commas are stripped ("$1,234" -> Some(1234.0)). Returns `None` when no
/// dollar amount can be found.
fn parse_price(price_text: &str) -> Option<f64> {
    let lower_price_text = price_text.to_lowercase();
    // Range form ("X to Y"): parse only the first price of the range.
    if lower_price_text.contains(" to ") {
        if let Some(first_part) = lower_price_text.split(" to ").next() {
            if let Some(caps) = PRICE_REGEX.captures(first_part) {
                if let Some(price_match) = caps.get(1) {
                    info!("Price string:{:?} parsed!", price_match);
                    return price_match.as_str().replace(',', "").parse().ok();
                }
            }
        }
        info!(
            "Price string:{:?} failed parsing with to, returning none.",
            price_text
        );
        return None;
    }
    // Plain single-price form.
    if let Some(caps) = PRICE_REGEX.captures(price_text) {
        if let Some(price_match) = caps.get(1) {
            let p = price_match.as_str().replace(',', "").parse().ok();
            debug!(
                "price regex passed, working on caps:{:?}, price_match:{:?}, p:{:?}",
                caps, price_match, p
            );
            return p;
        }
    }
    info!(
        "Price string:{:?} failed parsing, returning none.",
        price_text
    );
    None
}
/// Extracts item data from HTML content.
///
/// Parses an eBay search-results page into `Listing`s. Each listing keeps
/// `id = 0` (DB key not yet assigned) and `added_time = *timestamp`. Cards
/// missing a title, price, or item id are skipped with a warning.
pub fn extract_data_from_html(
    html_content: &str,
    timestamp: &chrono::DateTime<Utc>,
) -> Option<Vec<Listing>> {
    let document = Html::parse_document(html_content);
    let mut items = Vec::new();
    // Multiple selector alternatives per field: eBay serves a few different
    // search-result markups depending on layout/experiment.
    let item_selector =
        Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
    let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
    let price_selector = Selector::parse(".s-item__price").unwrap();
    let image_selector =
        Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
            .unwrap();
    let link_selector =
        Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
    let bid_count_selector =
        Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap();
    let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap();
    let _secondary_info_selector = Selector::parse(".s-item__detail--secondary").unwrap();
    let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();
    for element in document.select(&item_selector) {
        let raw_title_text = element
            .select(&title_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let price_text = element
            .select(&price_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        // Item id is pulled out of the /itm/<digits> link href.
        let id = element
            .select(&link_selector)
            .next()
            .and_then(|link_el| link_el.value().attr("href"))
            .and_then(|href| ITEM_ID_REGEX.captures(href))
            .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()))
            .and_then(|id_str| id_str.parse::<i64>().ok());
        if raw_title_text.is_none() || price_text.is_none() || id.is_none() {
            warn!(
                "Skipping {:?} due to missing title, price, or item ID.",
                element
            );
            continue;
        }
        // presumably eBay's placeholder/template card uses this id —
        // TODO confirm.
        if id.unwrap() == 123456 {
            info!("Skipping {:?} due to bogus ID of 123456", element);
            continue;
        }
        let raw_title = raw_title_text.unwrap();
        let price_text = price_text.unwrap();
        // Strip the "NEW LISTING" badge text that eBay prepends to titles.
        let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
        let primary_display_price = parse_price(&price_text);
        let mut current_bid_price: Option<f64> = None;
        let mut final_buy_it_now_price: Option<f64> = None;
        // A card mentioning "bid" in its bid-count element is an auction.
        let mut item_is_auction = false;
        if let Some(bid_el) = element.select(&bid_count_selector).next() {
            if bid_el
                .text()
                .collect::<String>()
                .to_lowercase()
                .contains("bid")
            {
                item_is_auction = true;
            }
        }
        let has_best_offer = element
            .select(&primary_info_selector)
            .any(|e| e.text().any(|e| e.to_lowercase().contains("or best offer")));
        // For auctions the displayed price is the current bid; a separate
        // buy-it-now price may also be present. Otherwise the displayed
        // price *is* the buy-it-now price.
        if item_is_auction {
            current_bid_price = primary_display_price;
            if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
                final_buy_it_now_price = parse_price(&bin_el.text().collect::<String>());
            }
        } else {
            final_buy_it_now_price = primary_display_price;
        }
        // NOTE(review): this unwrap panics if a card has no image element —
        // confirm every served layout always includes one.
        let image_url = element
            .select(&image_selector)
            .next()
            .and_then(|img_el| {
                img_el
                    .value()
                    .attr("data-src")
                    .or(img_el.value().attr("src"))
            })
            .map(|s| s.to_string())
            .unwrap();
        items.push(Listing {
            title,
            id: 0,
            // `id` is Some here (checked above), so this `?` never bails.
            item_id: id?,
            added_time: *timestamp,
            current_bid_price,
            buy_it_now_price: final_buy_it_now_price,
            has_best_offer,
            image_url,
        });
    }
    Some(items)
}
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    /// Parses a checked-in scraped page and spot-checks two of the first ten
    /// listings: one fixed-price (buy-it-now only) and one auction (bid only).
    #[test_log::test]
    fn parse() {
        let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
        let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
        let parsed = extract_data_from_html(html, &timestamp).unwrap();
        // assert_eq!(parsed.len(), 62);
        let parsed = parsed.first_chunk::<10>().unwrap();
        assert_eq!(
            parsed[0],
            Listing {
                id: 0,
                item_id: 388484391867,
                title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
                added_time: timestamp,
                current_bid_price: None,
                buy_it_now_price: Some(59.99),
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
                    .to_string()
            }
        );
        assert_eq!(
            parsed[4],
            Listing {
                id: 0,
                item_id: 286605201240,
                title:
                    "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
                        .to_string(),
                added_time: timestamp,
                current_bid_price: Some(12.60),
                buy_it_now_price: None,
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
                    .to_string()
            }
        );
    }
}

208
src/parser_storage.rs Normal file
View File

@ -0,0 +1,208 @@
use crate::db::ParsedStorage;
use lazy_static::lazy_static;
use regex::*;
// let parsed_size_info = crate::parser_storage::parse_size_and_quantity(&cleaned_title);
// let _cost_per_tb = if let Some(price) = primary_display_price {
// if parsed_size_info.total_tb > 0.0 {
// Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
// } else {
// None
// }
// } else {
// None
// };
lazy_static! {
    /// Patterns that make the lot quantity explicit — "LOT OF 3", "LOT(4)",
    /// "PACK OF 2", "4-PACK", "10 COUNT". Capture group 1 is the count.
    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
    ];
    /// A single "<number> GB|TB" capacity mention (value in group 1, unit in
    /// group 2).
    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
    /// A capacity range such as "128GB-512GB" or "1TB TO 2TB", which cannot
    /// be resolved from the title alone.
    static ref SIZE_RANGE_REGEX: Regex =
        Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)")
            .unwrap();
}
/// Parses size and quantity information from an item title.
///
/// Heuristics:
/// - Quantity comes from the first explicit pattern ("LOT OF 3", "2-PACK",
///   "10 COUNT", ...) yielding a sane count (0 < n < 500); defaults to 1.
/// - Sizes come from "<number> GB|TB" tokens; TB values are multiplied by
///   1024 and truncated to whole gigabytes. The smallest distinct size is
///   taken as the per-unit size.
/// - Titles mentioning several distinct sizes, a size range, mixed lots, or
///   "see description" wording are flagged `needed_description_check`.
///
/// The returned `ParsedStorage` has `id = 0` (unassigned) and
/// `parse_engine = 0`.
pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
    let upper_title = title.to_uppercase();
    let mut total_gb = 0i64;
    let mut quantity = 1i64;
    let mut needed_description_check = false;
    let mut individual_size_gb = 0i64;

    // First explicit-quantity pattern with a sane count wins.
    for pattern in EXPLICIT_QTY_PATTERNS.iter() {
        if let Some(caps) = pattern.captures(&upper_title) {
            if let Some(qty_match) = caps.get(1) {
                if let Ok(parsed_qty) = qty_match.as_str().parse::<i64>() {
                    if parsed_qty > 0 && parsed_qty < 500 {
                        quantity = parsed_qty;
                        break;
                    }
                }
            }
        }
    }

    // Collect every "<number> GB|TB" mention in the title.
    let mut size_matches: Vec<(f64, String)> = Vec::new();
    for caps in SIZE_REGEX.captures_iter(&upper_title) {
        if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
            if let Ok(val) = val_str.as_str().parse::<f64>() {
                size_matches.push((val, unit_str.as_str().to_string()));
            }
        }
    }

    if !size_matches.is_empty() {
        // Normalize to whole gigabytes; the cast truncates, e.g.
        // 1.92 TB -> 1966 GB.
        let mut unique_sizes_gb: Vec<i64> = size_matches
            .iter()
            .map(|(val, unit)| {
                let gb = if unit == "TB" { *val * 1024.0 } else { *val };
                gb as i64
            })
            .collect();
        // i64 is totally ordered, so the previous partial_cmp-based sort was
        // unnecessary ceremony; sort + dedup puts the smallest distinct size
        // first.
        unique_sizes_gb.sort_unstable();
        unique_sizes_gb.dedup();
        if !unique_sizes_gb.is_empty() {
            individual_size_gb = unique_sizes_gb[0];
            if unique_sizes_gb.len() > 1 {
                // More than one distinct size mentioned — the title alone is
                // not enough to know what is being sold.
                needed_description_check = true;
            }
        }
    }

    // Ranges like "128GB-512GB" can never be resolved from the title.
    if SIZE_RANGE_REGEX.is_match(&upper_title) {
        needed_description_check = true;
    }
    if quantity > 1 && upper_title.contains("MIXED") {
        needed_description_check = true;
    }
    if upper_title.contains("CHECK THE DESCRIPTION")
        || upper_title.contains("CHECK DESCRIPTION")
        || upper_title.contains("SEE DESCRIPTION")
    {
        if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
            needed_description_check = true;
        }
    }

    if individual_size_gb > 0 {
        total_gb = individual_size_gb * quantity;
    }
    if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
        needed_description_check = true;
    }
    // (A previously-present empty `if quantity == 1 && ...` block was a
    // no-op and has been removed.)

    ParsedStorage {
        id: 0,
        item: item_id,
        total_gigabytes: total_gb,
        quantity,
        individual_size_gigabytes: individual_size_gb,
        needed_description_check,
        parse_engine: 0,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    // Each entry is (title, expected parse result, whether the parser
    // currently handles the title correctly — false entries are known gaps
    // and are skipped below).
    static TESTS: &[(&str, ParsedStorage, bool)] = &[
        (
            "Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD",
            ParsedStorage {
                id: 0,
                item: 0,
                total_gigabytes: 512 * 3,
                quantity: 3,
                individual_size_gigabytes: 512,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Samsung SM883 2.5” 240GB SATA 6Gbps MZ7KH240HAHQ-00005",
            ParsedStorage {
                id: 0,
                item: 1,
                total_gigabytes: 240,
                quantity: 1,
                individual_size_gigabytes: 240,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "1TB AData SU650 2.5-inch SATA 6Gb/s SSD Solid State Disk (READ)",
            ParsedStorage {
                id: 0,
                item: 2,
                total_gigabytes: 1024,
                quantity: 1,
                individual_size_gigabytes: 1024,
                parse_engine: 0,
                needed_description_check: true,
            },
            false, // Sadly this one fails :/
        ),
        (
            "Hitachi VSP 7TB Flash Module Drive (FMD) 3286734-A DKC-F810I-7R0FP",
            ParsedStorage {
                id: 0,
                item: 4,
                total_gigabytes: 7 * 1024,
                quantity: 1,
                individual_size_gigabytes: 7 * 1024,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)",
            ParsedStorage {
                id: 0,
                item: 5,
                total_gigabytes: 6 * 256,
                quantity: 6,
                individual_size_gigabytes: 256,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            // 1.92 TB * 1024 truncates to 1966 GB.
            "Lenovo-Micron 5300 Pro 1.92TB 2.5\" Sata SSD 02JG540 (MTFDDAK1T9TDS)",
            ParsedStorage {
                id: 0,
                item: 6,
                total_gigabytes: 1966,
                quantity: 1,
                individual_size_gigabytes: 1966,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
    ];

    /// Runs the parser over every table entry marked as currently-passing.
    #[test_log::test]
    fn parse() {
        for t in TESTS {
            if t.2 {
                assert_eq!(t.1, parse_size_and_quantity(t.1.item, t.0));
            }
        }
    }
}

74
src/xdg_dirs.rs Normal file
View File

@ -0,0 +1,74 @@
use std::path::PathBuf;
use tracing::{info, warn};
/// The XDG base-directory categories this module can resolve. Only
/// `XdgDataHome` is implemented so far; the rest are placeholders (their
/// match arms in `ensure_xdg_dir_exists` are `todo!`).
enum XdgType {
    XdgDataHome,
    #[allow(dead_code)]
    XdgConfigHome,
    #[allow(dead_code)]
    XdgCacheHome,
    #[allow(dead_code)]
    XdgStateHome,
    #[allow(dead_code)]
    XdgRuntimeDir,
}
/// Resolves the base directory for `xdg_type`, then ensures
/// `<base>/<app_name>` exists (creating it if needed) and returns it.
///
/// Only `XdgDataHome` is implemented; the other variants are `todo!`.
///
/// # Panics
/// Panics if the base directory does not exist, or if the app directory
/// cannot be queried/created.
fn ensure_xdg_dir_exists(app_name: &str, xdg_type: XdgType) -> PathBuf {
    let dir = match xdg_type {
        XdgType::XdgDataHome => dirs::data_local_dir().unwrap_or_else(|| {
            // The XDG Base Directory spec's fallback for XDG_DATA_HOME is
            // "$HOME/.local/share". The previous ".local/.share" had a stray
            // dot and pointed at a directory that normally does not exist,
            // which would then trip the existence panic below.
            let d = dirs::home_dir().unwrap().join(".local").join("share");
            warn!(
                "OS returned no data local dir (XDG_DATA_HOME), HOME is {:?}, using {:?}!",
                dirs::home_dir(),
                d
            );
            d
        }),
        XdgType::XdgConfigHome => todo!("Not yet tested/implimented"),
        XdgType::XdgCacheHome => todo!("Not yet tested/implimented"),
        XdgType::XdgStateHome => todo!("Not yet tested/implimented"),
        XdgType::XdgRuntimeDir => todo!("Not yet tested/implimented"),
    };
    if !std::fs::exists(&dir).unwrap() {
        panic!(
            "Base directory to use for this app does not exist at {:?}",
            dir
        );
    }
    let dir = dir.join(app_name);
    if !std::fs::exists(&dir).unwrap() {
        info!(
            "App directory to use for this app does not exist at {:?}, creating now.",
            dir
        );
        std::fs::create_dir(&dir).unwrap();
    }
    dir
}
/// Returns the directory scraped pages are stored in.
///
/// When `override_path` is given it is used as-is and must already exist.
/// Otherwise `<XDG_DATA_HOME>/<app_name>/raw_scraped` is used, created on
/// demand.
///
/// # Panics
/// Panics if the override path does not exist, or the default directory
/// cannot be queried/created.
pub fn ensure_scrapedata_dir_exists(app_name: &str, override_path: Option<PathBuf>) -> PathBuf {
    // `if let` moves the path out once, replacing the previous
    // clone()/unwrap() chains over `override_path`.
    if let Some(p) = override_path {
        if !std::fs::exists(&p).unwrap() {
            panic!(
                "Override path of {:?} was given but does not exist, bailing.",
                p
            );
        }
        return p;
    }
    let app_dir = ensure_xdg_dir_exists(app_name, XdgType::XdgDataHome);
    let raw_scraped_dir = app_dir.join("raw_scraped");
    if !std::fs::exists(&raw_scraped_dir).unwrap() {
        info!(
            "scrape directory to use for this app does not exist at {:?}, creating now.",
            raw_scraped_dir
        );
        std::fs::create_dir(&raw_scraped_dir).unwrap();
    }
    raw_scraped_dir
}

View File

@ -0,0 +1,11 @@
[Unit]
Description=Run a single instance of a fetch
After=syslog.target network.target
[Service]
Type=exec
#Environment=XDG_DATA_HOME=/home/hak8or/code/ebay_scraper_rust/.tmp_run
ExecStart=/usr/local/bin/scraper_fetch.sh
[Install]
WantedBy=multi-user.target

31
systemd/scraper_fetch.sh Executable file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env bash
# Fetches the configured eBay search pages (SSDs and mini PCs) into
# $XDG_DATA_HOME/ebay_scraper/raw_scraped/<category>/<unix_ts>.html,
# writing a url.json per category on first run.
#
# Fixes over the original:
#  - wget -o wrote the *transfer log* to the timestamped file (the committed
#    wget logs in this repo show exactly that); -O writes the fetched page.
#  - jq without -r emits a JSON-quoted string, so wget was handed a URL
#    wrapped in literal double quotes; -r outputs the raw URL.
#  - bare `exit` returned status 0 on the missing-env error path.

URL_PER_PAGE_60="&_ipg=60"
URL_PER_PAGE_240="&_ipg=240"
URL_MIN_PRICE_USD_60="&_udlo=60.00"
URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"

if [ -z "${XDG_DATA_HOME}" ]; then
    echo "XDG_DATA_HOME was not set, bailing!"
    exit 1
fi

DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
if [ ! -s "$DIR_SSDS/url.json" ]; then
    URL_CATEGORY_SSD="&_sacat=175669"
    URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
    echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
wget -O "$DIR_SSDS/$(date +%s).html" "$(jq -r '.url' "$DIR_SSDS/url.json")"

DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
    URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
    URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
    echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
wget -O "$DIR_MINIPC/$(date +%s).html" "$(jq -r '.url' "$DIR_MINIPC/url.json")"

View File

@ -0,0 +1,10 @@
[Unit]
Description=Run a fetch of a website
[Timer]
OnBootSec=15min
OnUnitActiveSec=1h
Persistent=true
[Install]
WantedBy=timers.target

View File

@ -0,0 +1,59 @@
--2025-06-22 20:08:55-- https://www.ebay.com/sch/i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
0K .......... .......... .......... .......... .......... 6.28M
50K .......... .......... .......... .......... .......... 76.1K
100K .......... .......... .......... .......... .......... 18.6M
150K .......... .......... .......... .......... .......... 12.7M
200K .......... .......... .......... .......... .......... 34.4M
250K .......... .......... .......... .......... .......... 25.0M
300K .......... .......... .......... .......... .......... 41.3M
350K .......... .......... .......... .......... .......... 114M
400K .......... .......... .......... .......... .......... 73.4M
450K .......... .......... .......... .......... .......... 33.5M
500K .......... .......... .......... .......... .......... 50.2M
550K .......... .......... .......... .......... .......... 76.2M
600K .......... .......... .......... .......... .......... 109M
650K .......... .......... .......... .......... .......... 61.5M
700K .......... .......... .......... .......... .......... 81.1M
750K .......... .......... .......... .......... .......... 337M
800K .......... .......... .......... .......... .......... 118M
850K .......... .......... .......... .......... .......... 85.5M
900K .......... .......... .......... .......... .......... 92.6M
950K .......... .......... .......... .......... .......... 96.7M
1000K .......... .......... .......... .......... .......... 84.6M
1050K .......... .......... .......... .......... .......... 500M
1100K .......... .......... .......... .......... .......... 109M
1150K .......... .......... .......... .......... .......... 83.5M
1200K .......... .......... .......... .......... .......... 160M
1250K .......... .......... .......... .......... .......... 141M
1300K .......... .......... .......... .......... .......... 41.7M
1350K .......... .......... .......... .......... .......... 96.4M
1400K .......... .......... .......... .......... .......... 2.47M
1450K .......... .......... .......... .......... .......... 36.6M
1500K .......... .......... .......... .......... .......... 83.5M
1550K .......... .......... .......... .......... .......... 71.7M
1600K .......... .......... .......... .......... .......... 37.7M
1650K .......... .......... .......... .......... .......... 104M
1700K .......... .......... .......... .......... .......... 73.7M
1750K .......... .......... .......... .......... .......... 115M
1800K .......... .......... .......... .......... .......... 85.3M
1850K .......... .......... .......... .......... .......... 140M
1900K .......... .......... .......... .......... .......... 71.1M
1950K .......... .......... .......... .......... .......... 112M
2000K .......... .......... .......... .......... .......... 75.4M
2050K .......... .......... .......... .......... .......... 120M
2100K .......... .......... .......... .......... .......... 112M
2150K .......... .......... .......... .......... .......... 117M
2200K .......... .......... .......... .......... .......... 108M
2250K .......... .......... .......... .......... .......... 97.1M
2300K .......... .......... .......... .......... .......... 31.8M
2350K ...... 11.4T=0.7s
2025-06-22 20:08:56 (3.20 MB/s) - i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240 saved [2412662]

View File

@ -0,0 +1 @@
{"url": "https://www.ebay.com/sch/i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240"}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,63 @@
--2025-06-22 20:08:17-- https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
0K .......... .......... .......... .......... .......... 8.04M
50K .......... .......... .......... .......... .......... 83.3K
100K .......... .......... .......... .......... .......... 1.38M
150K .......... .......... .......... .......... .......... 7.12M
200K .......... .......... .......... .......... .......... 18.8M
250K .......... .......... .......... .......... .......... 18.0M
300K .......... .......... .......... .......... .......... 19.4M
350K .......... .......... .......... .......... .......... 48.4M
400K .......... .......... .......... .......... .......... 45.9M
450K .......... .......... .......... .......... .......... 50.4M
500K .......... .......... .......... .......... .......... 50.1M
550K .......... .......... .......... .......... .......... 119M
600K .......... .......... .......... .......... .......... 45.4M
650K .......... .......... .......... .......... .......... 44.1M
700K .......... .......... .......... .......... .......... 59.1M
750K .......... .......... .......... .......... .......... 84.0M
800K .......... .......... .......... .......... .......... 167M
850K .......... .......... .......... .......... .......... 76.6M
900K .......... .......... .......... .......... .......... 59.4M
950K .......... .......... .......... .......... .......... 60.3M
1000K .......... .......... .......... .......... .......... 113M
1050K .......... .......... .......... .......... .......... 592M
1100K .......... .......... .......... .......... .......... 53.9M
1150K .......... .......... .......... .......... .......... 101M
1200K .......... .......... .......... .......... .......... 91.9M
1250K .......... .......... .......... .......... .......... 108M
1300K .......... .......... .......... .......... .......... 85.2M
1350K .......... .......... .......... .......... .......... 96.9M
1400K .......... .......... .......... .......... .......... 93.5M
1450K .......... .......... .......... .......... .......... 51.2M
1500K .......... .......... .......... .......... .......... 69.9M
1550K .......... .......... .......... .......... .......... 654M
1600K .......... .......... .......... .......... .......... 185M
1650K .......... .......... .......... .......... .......... 9.94M
1700K .......... .......... .......... .......... .......... 27.5M
1750K .......... .......... .......... .......... .......... 613M
1800K .......... .......... .......... .......... .......... 659M
1850K .......... .......... .......... .......... .......... 21.3M
1900K .......... .......... .......... .......... .......... 107M
1950K .......... .......... .......... .......... .......... 158M
2000K .......... .......... .......... .......... .......... 37.8M
2050K .......... .......... .......... .......... .......... 85.2M
2100K .......... .......... .......... .......... .......... 26.0M
2150K .......... .......... .......... .......... .......... 57.1M
2200K .......... .......... .......... .......... .......... 114M
2250K .......... .......... .......... .......... .......... 117M
2300K .......... .......... .......... .......... .......... 57.9M
2350K .......... .......... .......... .......... .......... 127M
2400K .......... .......... .......... .......... .......... 118M
2450K .......... .......... .......... .......... .......... 62.1M
2500K .......... .......... .......... .......... .......... 157M
2550K 723G=0.7s
2025-06-22 20:08:18 (3.60 MB/s) - i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240 saved [2611588]

View File

@ -0,0 +1,63 @@
--2025-06-22 20:08:54-- https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240.2
0K .......... .......... .......... .......... .......... 4.98M
50K .......... .......... .......... .......... .......... 75.7K
100K .......... .......... .......... .......... .......... 4.26M
150K .......... .......... .......... .......... .......... 10.1M
200K .......... .......... .......... .......... .......... 25.3M
250K .......... .......... .......... .......... .......... 27.1M
300K .......... .......... .......... .......... .......... 37.0M
350K .......... .......... .......... .......... .......... 31.8M
400K .......... .......... .......... .......... .......... 58.2M
450K .......... .......... .......... .......... .......... 44.6M
500K .......... .......... .......... .......... .......... 40.7M
550K .......... .......... .......... .......... .......... 48.7M
600K .......... .......... .......... .......... .......... 719M
650K .......... .......... .......... .......... .......... 62.0M
700K .......... .......... .......... .......... .......... 61.0M
750K .......... .......... .......... .......... .......... 144M
800K .......... .......... .......... .......... .......... 270M
850K .......... .......... .......... .......... .......... 36.5M
900K .......... .......... .......... .......... .......... 64.1M
950K .......... .......... .......... .......... .......... 204M
1000K .......... .......... .......... .......... .......... 102M
1050K .......... .......... .......... .......... .......... 90.0M
1100K .......... .......... .......... .......... .......... 179M
1150K .......... .......... .......... .......... .......... 132M
1200K .......... .......... .......... .......... .......... 84.9M
1250K .......... .......... .......... .......... .......... 90.3M
1300K .......... .......... .......... .......... .......... 141M
1350K .......... .......... .......... .......... .......... 187M
1400K .......... .......... .......... .......... .......... 116M
1450K .......... .......... .......... .......... .......... 86.2M
1500K .......... .......... .......... .......... .......... 118M
1550K .......... .......... .......... .......... .......... 113M
1600K .......... .......... .......... .......... .......... 120M
1650K .......... .......... .......... .......... .......... 113M
1700K .......... .......... .......... .......... .......... 113M
1750K .......... .......... .......... .......... .......... 107M
1800K .......... .......... .......... .......... .......... 113M
1850K .......... .......... .......... .......... .......... 5.40M
1900K .......... .......... .......... .......... .......... 93.9M
1950K .......... .......... .......... .......... .......... 104M
2000K .......... .......... .......... .......... .......... 85.4M
2050K .......... .......... .......... .......... .......... 126M
2100K .......... .......... .......... .......... .......... 27.8M
2150K .......... .......... .......... .......... .......... 9.09M
2200K .......... .......... .......... .......... .......... 119M
2250K .......... .......... .......... .......... .......... 17.0M
2300K .......... .......... .......... .......... .......... 21.5M
2350K .......... .......... .......... .......... .......... 128M
2400K .......... .......... .......... .......... .......... 117M
2450K .......... .......... .......... .......... .......... 88.9M
2500K .......... .......... .......... .......... .......... 16.9M
2550K .. 5.53T=0.7s
2025-06-22 20:08:55 (3.38 MB/s) - i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240.2 saved [2614240]

View File

@ -0,0 +1 @@
{"url": "https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240"}