Initial rough commit
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 3m30s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m1s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m5s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 9m41s

This commit is contained in:
2025-06-22 03:00:41 -04:00
parent 5975323678
commit b9cc62e3dd
21 changed files with 12386 additions and 0 deletions

388
src/db.rs Normal file
View File

@ -0,0 +1,388 @@
use chrono::{DateTime, Utc};
use rusqlite::Connection;
use serde::Serialize;
use std::path::Path;
use tracing::info;
/// Minimal table abstraction: each DB-backed type declares its table name and
/// column schema and inherits a `CREATE TABLE IF NOT EXISTS` initializer.
trait DBTable {
    /// Name of the SQL table backing the implementing type.
    const TABLE_NAME: &'static str;
    /// Column definitions (and table constraints) placed inside CREATE TABLE.
    const TABLE_SCHEMA: &'static str;
    /// Creates the table if it does not already exist. Panics on SQL errors,
    /// since the schema is a compile-time constant and failure is a bug.
    fn initialize(conn: &Connection) {
        let create_table = &format!(
            "CREATE TABLE IF NOT EXISTS {} (
            {}
            )",
            Self::TABLE_NAME,
            Self::TABLE_SCHEMA
        );
        info!("Creating table with following schema;");
        info!("{} ({})", Self::TABLE_NAME, Self::TABLE_SCHEMA);
        conn.execute(create_table, ()).unwrap();
    }
}
/// A named eBay search URL — one row per search "category".
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct SearchURL {
    // Complete search URL this category scrapes.
    pub full_url: String,
    // Unique human-readable category name (e.g. "ssd").
    pub name: String,
}

impl DBTable for SearchURL {
    const TABLE_NAME: &'static str = "SearchURLs";
    // Column order is (id, url, name); lookup() indexes rows accordingly.
    const TABLE_SCHEMA: &'static str = "
        id INTEGER PRIMARY KEY,
        url TEXT NOT NULL UNIQUE,
        name TEXT NOT NULL UNIQUE";
}
impl SearchURL {
    /// Fetches the search URL registered under `name`; `None` when the row is
    /// absent or the query fails.
    pub fn lookup(conn: &Connection, name: &str) -> Option<Self> {
        let mut stmt = conn
            .prepare(&format!(
                "SELECT * FROM {} WHERE name = ?",
                Self::TABLE_NAME
            ))
            .ok()?;
        stmt.query_one([name], |row| {
            Ok(SearchURL {
                // Column 0 is the rowid; not surfaced on this struct.
                full_url: row.get(1)?,
                name: row.get(2)?,
            })
        })
        .ok()
    }

    /// Inserts this search URL, replacing any existing row with the same
    /// unique name/url. Panics on SQL errors.
    pub fn add_or_update(&self, conn: &Connection) {
        // Fix: dropped the redundant `let _ =` around an already-unwrapped call.
        conn.execute(
            &format!(
                "INSERT OR REPLACE INTO {} (name, url) VALUES (?1, ?2)",
                Self::TABLE_NAME
            ),
            (&self.name, &self.full_url),
        )
        .unwrap();
    }

    /// Returns the names of all registered search URLs. Panics on SQL errors
    /// instead of the previous `.ok().unwrap()` / silently-dropped row errors.
    pub fn names(conn: &Connection) -> Vec<String> {
        let mut stmt = conn
            .prepare(&format!("SELECT name FROM {}", Self::TABLE_NAME))
            .unwrap();
        stmt.query_map([], |row| row.get(0))
            .unwrap()
            .collect::<Result<Vec<String>, _>>()
            .unwrap()
    }
}
/// Marks that a downloaded search page (identified by its scrape timestamp)
/// was parsed for a given category.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedPage {
    // When the page was scraped.
    pub timestamp: DateTime<Utc>,
    // Category name; references SearchURLs(name).
    pub category: String,
}

impl DBTable for ParsedPage {
    const TABLE_NAME: &'static str = "Pages_Parsed";
    // Fix: table constraints must be comma-separated — the comma between
    // UNIQUE(...) and FOREIGN KEY(...) was missing, which is not valid
    // CREATE TABLE syntax.
    const TABLE_SCHEMA: &'static str = "
        id INTEGER PRIMARY KEY,
        category TEXT NOT NULL,
        timestamp INTEGER NOT NULL,
        UNIQUE(category, timestamp),
        FOREIGN KEY(category) REFERENCES SearchURLs(name)
    ";
}
impl ParsedPage {
    /// Looks up the page parsed at exactly `timestamp`; `None` when no such
    /// row exists or the query fails.
    pub fn lookup_db(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
        let query = format!("SELECT * FROM {} WHERE timestamp = ?", Self::TABLE_NAME);
        let mut statement = conn.prepare(&query).ok()?;
        statement
            .query_one([timestamp], |row| {
                Ok(ParsedPage {
                    // index 0 is the rowid, intentionally skipped
                    category: row.get(1)?,
                    timestamp: row.get(2)?,
                })
            })
            .ok()
    }

    /// Upserts this (category, timestamp) pair. Panics on SQL errors.
    pub fn add_or_update_db(&self, conn: &Connection) {
        let query = format!(
            "INSERT OR REPLACE INTO {} (category, timestamp) VALUES (?1, ?2)",
            Self::TABLE_NAME
        );
        conn.execute(&query, (&self.category, self.timestamp)).unwrap();
    }
}
/// Storage size/quantity information parsed out of a listing title.
#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
pub struct ParsedStorage {
    pub id: i64,                        // DB rowid (0 before insertion)
    pub item: i64,                      // eBay item id; references Ebay_Items(item_id)
    pub total_gigabytes: i64,           // individual size * quantity
    pub quantity: i64,                  // number of drives in the lot
    pub individual_size_gigabytes: i64, // persisted as TEXT column `sizes_gigabytes`
    pub parse_engine: i64,              // version of the parser that produced this row
    pub needed_description_check: bool, // title alone was ambiguous
}

impl DBTable for ParsedStorage {
    const TABLE_NAME: &'static str = "Storage_Parsed";
    // Fix: table constraints must be comma-separated — the comma between
    // UNIQUE(...) and FOREIGN KEY(...) was missing, which is not valid
    // CREATE TABLE syntax.
    const TABLE_SCHEMA: &'static str = "
        id INTEGER PRIMARY KEY,
        item INTEGER,
        total_gigabytes INTEGER,
        quantity INTEGER,
        sizes_gigabytes TEXT,
        parse_engine INTEGER,
        need_description_check INTEGER,
        UNIQUE(item, parse_engine),
        FOREIGN KEY(item) REFERENCES Ebay_Items(item_id)
    ";
}
impl ParsedStorage {
    /// Returns every parse result recorded for the given eBay item id.
    ///
    /// Bug fix: this previously filtered on the surrogate `id` column even
    /// though the parameter is named `item` and every caller (the
    /// `listing/parse/{id}` route, which receives item ids) passes an eBay
    /// item id. It now filters on the `item` column.
    pub fn lookup_db(conn: &Connection, item: i64) -> Vec<ParsedStorage> {
        let mut stmt = conn
            .prepare(&format!(
                "SELECT * FROM {} WHERE item = ?",
                Self::TABLE_NAME
            ))
            .unwrap();
        stmt.query_map([item], |row| {
            Ok(ParsedStorage {
                id: row.get(0)?,
                item: row.get(1)?,
                total_gigabytes: row.get(2)?,
                quantity: row.get(3)?,
                // Stored as TEXT (`sizes_gigabytes`); parse it back to i64.
                individual_size_gigabytes: {
                    let r: String = row.get(4)?;
                    r.parse().unwrap()
                },
                parse_engine: row.get(5)?,
                needed_description_check: row.get(6)?,
            })
        })
        .unwrap()
        .collect::<Result<Vec<_>, _>>()
        .unwrap()
    }

    /// Upserts this parse result, keyed by (item, parse_engine). Panics on
    /// SQL errors.
    pub fn add_or_update_db(&self, conn: &Connection) {
        conn.execute(
            &format!(
                "INSERT OR REPLACE INTO {}
                (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
                VALUES
                (?1, ?2, ?3, ?4, ?5, ?6)",
                Self::TABLE_NAME
            ),
            (
                &self.item,
                self.total_gigabytes,
                self.quantity,
                // Persisted as TEXT; see lookup_db for the reverse parse.
                self.individual_size_gigabytes.to_string(),
                self.parse_engine,
                self.needed_description_check,
            ),
        )
        .unwrap();
    }
}
/// One scraped eBay listing.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct Listing {
    pub id: i64,      // DB rowid (0 before insertion)
    pub item_id: i64, // unique eBay item id
    pub title: String,
    pub added_time: DateTime<Utc>,
    // NOTE(review): these hold f64 dollar amounts (e.g. 59.99) while the
    // backing columns are named *_usd_cents — confirm the intended unit.
    pub current_bid_price: Option<f64>,
    pub buy_it_now_price: Option<f64>,
    pub has_best_offer: bool,
    pub image_url: String,
}

impl DBTable for Listing {
    const TABLE_NAME: &'static str = "Ebay_Items";
    // NOTE(review): add_or_update inserts f64 dollar values into the
    // *_usd_cents INTEGER columns (SQLite stores them as REAL regardless of
    // the declared affinity) — confirm whether cents conversion was intended.
    const TABLE_SCHEMA: &'static str = "
        id INTEGER PRIMARY KEY,
        item_id INTEGER NOT NULL UNIQUE,
        title TEXT NOT NULL,
        added_time INTEGER NOT NULL,
        current_bid_usd_cents INTEGER,
        buy_it_now_usd_cents INTEGER,
        has_best_offer INTEGER NOT NULL,
        image_url TEXT NOT NULL
    ";
}
impl Listing {
pub fn lookup(conn: &Connection, item_id: i64) -> Option<Listing> {
let mut stmt = conn
.prepare(&format!(
"SELECT * FROM {} WHERE item_id = ?",
Self::TABLE_NAME
))
.ok()?;
stmt.query_one([item_id], |row| {
Ok(Listing {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
added_time: row.get(3)?,
current_bid_price: row.get(4)?,
buy_it_now_price: row.get(5)?,
has_best_offer: row.get(6)?,
image_url: row.get(7)?,
})
})
.ok()
}
pub fn lookup_since(conn: &Connection, since: i64, limit: i64) -> Vec<Self> {
let mut stmt = conn
.prepare(&format!(
"SELECT * FROM {}
WHERE added_time >= ?1
ORDER BY added_time
LIMIT ?2
",
Self::TABLE_NAME
))
.ok()
.unwrap();
stmt.query_map([since, limit], |row| {
Ok(Listing {
id: row.get(0)?,
item_id: row.get(1)?,
title: row.get(2)?,
added_time: row.get(3)?,
current_bid_price: row.get(4)?,
buy_it_now_price: row.get(5)?,
has_best_offer: row.get(6)?,
image_url: row.get(7)?,
})
})
.ok()
.unwrap()
.map(|e| e.unwrap())
.collect()
}
pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
let mut stmt = conn
.prepare(&format!(
"
SELECT ei.item_id, ei.title FROM {} AS ei
LEFT JOIN {} AS sp ON ei.item_id = sp.item
WHERE sp.item IS NULL",
Self::TABLE_NAME,
ParsedStorage::TABLE_NAME
))
.ok()
.unwrap();
stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
.ok()
.unwrap()
.map(|e| e.unwrap())
.collect()
}
pub fn add_or_update(&self, conn: &Connection) {
let count = conn
.execute(
&format!(
"INSERT OR REPLACE INTO {}
(
item_id,
title,
added_time,
current_bid_usd_cents,
buy_it_now_usd_cents,
has_best_offer,
image_url
)
VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
Self::TABLE_NAME
),
(
self.item_id,
&self.title,
self.added_time,
self.current_bid_price,
self.buy_it_now_price,
self.has_best_offer,
self.image_url.clone(),
),
)
.unwrap();
if count != 1 {
panic!("Expected count to be 1 but got {}", count);
}
}
}
/// Opens the database at `path` (or an in-memory database when `None`) and
/// creates all tables that do not yet exist. Panics if the database cannot
/// be opened or a schema is invalid.
pub fn get_initialized(path: Option<&Path>) -> Connection {
    let conn = match path {
        // Fix: `p` is already a &Path; the extra `&p` was a needless borrow.
        Some(p) => Connection::open(p),
        None => Connection::open_in_memory(),
    }
    .unwrap();
    SearchURL::initialize(&conn);
    Listing::initialize(&conn);
    ParsedStorage::initialize(&conn);
    ParsedPage::initialize(&conn);
    conn
}
#[cfg(test)]
mod tests {
    use super::*;

    /// End-to-end smoke test: insert then read back one row of each type.
    #[test]
    fn sanity_check() {
        let db = get_initialized(None);

        let searchurl = SearchURL {
            full_url: "google".to_owned(),
            name: "ssd".to_owned(),
        };
        searchurl.add_or_update(&db);
        assert_eq!(SearchURL::lookup(&db, &searchurl.name), Some(searchurl));

        // item_id of 1 deliberately coincides with the row's id so the
        // ParsedStorage lookup below is unambiguous. (The old test passed
        // listing.id where an item id was expected and only succeeded
        // because the rowid happened to equal 1.)
        let listing = Listing {
            id: 1,
            item_id: 1,
            title: "Some Title".to_string(),
            added_time: std::time::SystemTime::now().into(),
            current_bid_price: Some(0.12),
            buy_it_now_price: Some(1.23),
            has_best_offer: false,
            image_url: "google.com".to_string(),
        };
        listing.add_or_update(&db);
        assert_eq!(Listing::lookup(&db, listing.item_id), Some(listing.clone()));

        // `item` references the listing's item_id.
        let parsed = ParsedStorage {
            id: 1,
            item: listing.item_id,
            total_gigabytes: 13,
            quantity: 3,
            individual_size_gigabytes: 13,
            parse_engine: 9,
            needed_description_check: true,
        };
        parsed.add_or_update_db(&db);
        assert_eq!(ParsedStorage::lookup_db(&db, listing.item_id), vec![parsed]);

        let page = ParsedPage {
            category: "ssd".to_owned(),
            timestamp: std::time::SystemTime::now().into(),
        };
        page.add_or_update_db(&db);
        assert_eq!(ParsedPage::lookup_db(&db, page.timestamp), Some(page));
    }
}

4
src/lib.rs Normal file
View File

@ -0,0 +1,4 @@
//! eBay scraper library: SQLite persistence (`db`) plus HTML and title
//! parsers (`parser_ebay`, `parser_storage`).
pub mod db;
pub mod parser;
pub mod parser_ebay;
pub mod parser_storage;

190
src/main.rs Normal file
View File

@ -0,0 +1,190 @@
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use clap::Parser;
use ebay_scraper_rust::db::{Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::info;
mod xdg_dirs;
/// Command-line arguments. Currently empty; parsing exists so that
/// `--help`/`--version` work out of the box.
#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for homelab purposes"
)]
struct Args {}
/// GET /page/{timestamp} — the parsed-page record scraped at that UNIX
/// timestamp, as JSON (`null` when none exists).
#[get("/page/{timestamp}")]
async fn page_get(
    db: Data<Mutex<rusqlite::Connection>>,
    timestamp: web::Path<i64>,
) -> Result<impl Responder> {
    let ts = chrono::DateTime::from_timestamp(*timestamp, 0).unwrap();
    let conn = db.lock().unwrap();
    let page = ParsedPage::lookup_db(&conn, ts);
    Ok(web::Json(page))
}
/// GET /listing/{id} — one listing by eBay item id, as JSON (`null` when
/// absent).
#[get("/listing/{id}")]
async fn listing_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    let conn = db.lock().unwrap();
    let listing = Listing::lookup(&conn, *id);
    Ok(web::Json(listing))
}
/// GET /listing/since/{timestamp}/{limit} — up to `limit` listings added at
/// or after `timestamp`, oldest first, as JSON.
#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
    db: Data<Mutex<rusqlite::Connection>>,
    req: web::Path<(i64, i64)>,
) -> Result<impl Responder> {
    let (since, limit) = req.into_inner();
    let conn = db.lock().unwrap();
    let listings = Listing::lookup_since(&conn, since, limit);
    Ok(web::Json(listings))
}
/// POST listing/parse — runs the storage parser over every listing that has
/// no parse result yet and stores the results. Responds with the number of
/// listings processed.
#[post("listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
    let db_unlocked = db.lock().unwrap();
    let mut cnt = 0;
    for (item_id, title) in Listing::lookup_non_parsed(&db_unlocked) {
        let parsed = parser_storage::parse_size_and_quantity(item_id, &title);
        parsed.add_or_update_db(&db_unlocked);
        cnt += 1;
    }
    Ok(web::Json(cnt))
}
/// GET listing/parse/{id} — all stored parse results for one item id, as
/// JSON.
#[get("listing/parse/{id}")]
async fn listing_parse_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    let conn = db.lock().unwrap();
    let results = ParsedStorage::lookup_db(&conn, *id);
    Ok(web::Json(results))
}
/// Collects the numeric file stems (scrape UNIX timestamps) of every file in
/// `path`. Files whose stem does not parse as an integer are skipped.
/// Panics when `path` does not exist or a file name is not valid UTF-8.
pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
    assert!(
        std::fs::exists(path).expect("Directory must exist"),
        "Directory {:?} does not exist, cannot grab timestamps from there.",
        path
    );
    let mut stamps: Vec<i64> = Vec::new();
    for entry in std::fs::read_dir(path).unwrap() {
        info!("Found {:?}", entry);
        let file_path = entry.unwrap().path();
        let stem = file_path
            .file_stem()
            .and_then(|s| s.to_str())
            .expect("Invalid file name");
        // Non-numeric stems (e.g. "url") are silently ignored.
        if let Ok(ts) = stem.parse() {
            stamps.push(ts);
        }
    }
    stamps
}
/// POST page/parse/{category} — registers the category's search URL from its
/// `url.json`, then parses every scraped page timestamp in the category's
/// download directory that has not been parsed for this category yet, and
/// upserts the extracted listings. Responds with the count of listings added.
///
/// The DB mutex is locked per operation, so other handlers can interleave
/// between steps of this request.
#[post("page/parse/{category}")]
async fn parse_post(
    db: Data<Mutex<rusqlite::Connection>>,
    downloaddir: Data<PathBuf>,
    category: web::Path<String>,
) -> Result<impl Responder> {
    let dir = &downloaddir.join(category.clone());
    // Ensure the category is created.
    // NOTE(review): `Value::to_string()` serializes the whole JSON document,
    // quotes included — confirm url.json's schema; extracting a field with
    // `as_str()` may be what was intended here.
    let url: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
    info!("{:?}", url);
    let su = SearchURL {
        full_url: url.to_string(),
        name: category.to_string(),
    };
    info!("{:?}", su);
    su.add_or_update(&db.lock().unwrap());
    let added: u64 = timestamps_from_dir(dir)
        .iter()
        .filter(|t| {
            info!("Checking for the existance of page {t}");
            let p = ParsedPage::lookup_db(
                &db.lock().unwrap(),
                chrono::DateTime::from_timestamp(**t, 0).unwrap(),
            );
            // Timestamp never seen before, let's pass it on.
            if p.is_none() {
                return true;
            }
            // Timestamp was seen before *and* from the same category, don't
            // pass it on.
            if p.unwrap().category == *category {
                return false;
            }
            return true;
        })
        .map(|t| {
            let timestamp = chrono::DateTime::from_timestamp(*t, 0).unwrap();
            info!("Adding or updating db with timestamp:{timestamp} catagory:{category}");
            // Record the page as parsed before inserting its listings.
            ParsedPage {
                timestamp: timestamp,
                category: category.to_string(),
            }
            .add_or_update_db(&db.lock().unwrap());
            // Count and insert every listing parsed out of this page's HTML.
            let mut cnt = 0;
            parser_ebay::extract_data_from_html(
                &std::fs::read_to_string(dir.join(format!("{t}.html"))).unwrap(),
                &timestamp,
            )
            .unwrap()
            .iter()
            .for_each(|l| {
                cnt = cnt + 1;
                l.add_or_update(&db.lock().unwrap());
                info!("Inserting id:{}, title:{}", l.item_id, l.title);
            });
            cnt
        })
        .sum();
    Ok(added.to_string())
}
/// Entry point: initializes logging, resolves the scrape-data directory,
/// opens the database, and serves the HTTP API on 127.0.0.1:8080.
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
    // Parsed only for --help/--version side effects; no options yet.
    let _ = Args::parse();
    let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
    info!(
        "Starting with scraped data dir of \"{}\".",
        scrapedatadir.to_str().unwrap()
    );
    // NOTE(review): get_initialized(None) opens an *in-memory* DB, so all
    // data is lost on restart — confirm whether an on-disk path was intended.
    let db_mutex = Data::new(Mutex::new(get_initialized(None)));
    // NOTE(review): `listing_parse_get` is defined above but never registered
    // with .service(...) — confirm whether that route should be exposed.
    HttpServer::new(move || {
        App::new()
            .service(page_get)
            .service(listing_get)
            .service(listing_since_get)
            .service(parse_post)
            .service(parse_listings)
            .app_data(db_mutex.clone())
            .app_data(Data::new(scrapedatadir.clone()))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}

1
src/parser.rs Normal file
View File

@ -0,0 +1 @@

209
src/parser_ebay.rs Normal file
View File

@ -0,0 +1,209 @@
use crate::db::Listing;
use chrono::Utc;
use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector};
use tracing::{debug, info, warn};
lazy_static! {
    // Matches "$1,234.56"-style prices; group 1 is the numeric part.
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
    // Extracts the numeric item id out of an /itm/<id> URL; group 1 is the id.
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
    // Strips a leading "New Listing" badge (case-insensitive) from titles.
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}
/// Parses a dollar price out of free-form text such as "$1,234.56".
/// For range strings ("$10 to $20") only the first (lower) price is used.
/// Returns `None` when nothing parseable is found.
fn parse_price(price_text: &str) -> Option<f64> {
    let lowered = price_text.to_lowercase();
    if lowered.contains(" to ") {
        // Range form: parse just the part before " to ".
        if let Some(first_chunk) = lowered.split(" to ").next() {
            if let Some(captures) = PRICE_REGEX.captures(first_chunk) {
                if let Some(matched) = captures.get(1) {
                    info!("Price string:{:?} parsed!", matched);
                    return matched.as_str().replace(',', "").parse().ok();
                }
            }
        }
        info!(
            "Price string:{:?} failed parsing with to, returning none.",
            price_text
        );
        return None;
    }
    // Single-price form: take the first regex capture and strip thousands
    // separators before parsing.
    if let Some(captures) = PRICE_REGEX.captures(price_text) {
        if let Some(matched) = captures.get(1) {
            let parsed = matched.as_str().replace(',', "").parse().ok();
            debug!(
                "price regex passed, working on caps:{:?}, price_match:{:?}, p:{:?}",
                captures, matched, parsed
            );
            return parsed;
        }
    }
    info!(
        "Price string:{:?} failed parsing, returning none.",
        price_text
    );
    None
}
/// Extracts item data from HTML content.
///
/// Returns one `Listing` per search-result element that carries a title, a
/// displayed price, and an /itm/ item id; elements missing any of those (or
/// carrying eBay's placeholder id 123456) are skipped. `timestamp` becomes
/// each listing's added_time; `id` is left 0 for the DB to assign.
pub fn extract_data_from_html(
    html_content: &str,
    timestamp: &chrono::DateTime<Utc>,
) -> Option<Vec<Listing>> {
    let document = Html::parse_document(html_content);
    let mut items = Vec::new();
    // Several selector variants per field to cope with eBay markup changes.
    let item_selector =
        Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
    let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
    let price_selector = Selector::parse(".s-item__price").unwrap();
    let image_selector =
        Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
            .unwrap();
    let link_selector =
        Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
    let bid_count_selector =
        Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap();
    let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap();
    let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();
    for element in document.select(&item_selector) {
        let raw_title_text = element
            .select(&title_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let price_text = element
            .select(&price_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let id = element
            .select(&link_selector)
            .next()
            .and_then(|link_el| link_el.value().attr("href"))
            .and_then(|href| ITEM_ID_REGEX.captures(href))
            .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()))
            .and_then(|id_str| id_str.parse::<i64>().ok());
        // All three fields are required; bind them once instead of the old
        // is_none() checks followed by repeated unwrap()/`id?`.
        let (Some(raw_title), Some(price_text), Some(item_id)) = (raw_title_text, price_text, id)
        else {
            warn!(
                "Skipping {:?} due to missing title, price, or item ID.",
                element
            );
            continue;
        };
        if item_id == 123456 {
            info!("Skipping {:?} due to bogus ID of 123456", element);
            continue;
        }
        let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
        let primary_display_price = parse_price(&price_text);
        let mut current_bid_price: Option<f64> = None;
        let mut final_buy_it_now_price: Option<f64> = None;
        // A visible bid count marks the listing as an auction.
        let item_is_auction = element
            .select(&bid_count_selector)
            .next()
            .map(|el| el.text().collect::<String>().to_lowercase().contains("bid"))
            .unwrap_or(false);
        let has_best_offer = element
            .select(&primary_info_selector)
            .any(|e| e.text().any(|t| t.to_lowercase().contains("or best offer")));
        if item_is_auction {
            // Displayed price is the current bid; a separate buy-it-now price
            // may also be present.
            current_bid_price = primary_display_price;
            if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
                final_buy_it_now_price = parse_price(&bin_el.text().collect::<String>());
            }
        } else {
            final_buy_it_now_price = primary_display_price;
        }
        // Robustness fix: a listing without an image no longer panics the
        // whole parse; store an empty URL and log it instead.
        let image_url = element
            .select(&image_selector)
            .next()
            .and_then(|img_el| {
                img_el
                    .value()
                    .attr("data-src")
                    .or(img_el.value().attr("src"))
            })
            .map(|s| s.to_string())
            .unwrap_or_else(|| {
                warn!("Listing {} has no image URL; storing empty string.", item_id);
                String::new()
            });
        items.push(Listing {
            title,
            id: 0,
            item_id,
            added_time: *timestamp,
            current_bid_price,
            buy_it_now_price: final_buy_it_now_price,
            has_best_offer,
            image_url,
        });
    }
    Some(items)
}
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    /// Parses a checked-in scraped page and spot-checks two of the first ten
    /// listings (one buy-it-now, one auction).
    #[test_log::test]
    fn parse() {
        let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
        let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
        let parsed = extract_data_from_html(html, &timestamp).unwrap();
        // assert_eq!(parsed.len(), 62);
        let parsed = parsed.first_chunk::<10>().unwrap();
        // Fixed-price listing: only buy_it_now_price is set.
        assert_eq!(
            parsed[0],
            Listing {
                id: 0,
                item_id: 388484391867,
                title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
                added_time: timestamp,
                current_bid_price: None,
                buy_it_now_price: Some(59.99),
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
                    .to_string()
            }
        );
        // Auction listing: only current_bid_price is set.
        assert_eq!(
            parsed[4],
            Listing {
                id: 0,
                item_id: 286605201240,
                title:
                    "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
                        .to_string(),
                added_time: timestamp,
                current_bid_price: Some(12.60),
                buy_it_now_price: None,
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
                    .to_string()
            }
        );
    }
}

208
src/parser_storage.rs Normal file
View File

@ -0,0 +1,208 @@
use crate::db::ParsedStorage;
use lazy_static::lazy_static;
use regex::*;
// let parsed_size_info = crate::parser_storage::parse_size_and_quantity(&cleaned_title);
// let _cost_per_tb = if let Some(price) = primary_display_price {
// if parsed_size_info.total_tb > 0.0 {
// Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
// } else {
// None
// }
// } else {
// None
// };
lazy_static! {
    // Explicit quantity phrasings — "LOT OF 3", "LOT(3)", "LOT*3",
    // "PACK OF 2", "4-PACK", "10 COUNT" — group 1 is the quantity.
    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
    ];
    // A single "<number> GB|TB" capacity; group 1 = value, group 2 = unit.
    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
    // A capacity *range* like "500GB - 2TB" — ambiguous from the title alone.
    static ref SIZE_RANGE_REGEX: Regex =
        Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)")
            .unwrap();
}
/// Parses size and quantity information from an item title.
///
/// Heuristics: an explicit "LOT OF N"-style marker sets the quantity (default
/// 1); every "<num> GB/TB" mention is normalized to whole gigabytes and the
/// smallest distinct size is taken as the per-drive capacity. Titles with
/// multiple distinct sizes, size ranges, "MIXED" lots, or "see description"
/// phrasing set `needed_description_check`.
pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
    let upper_title = title.to_uppercase();
    let mut quantity = 1i64;
    let mut needed_description_check = false;

    // First pattern yielding a sane (0 < qty < 500) quantity wins.
    for pattern in EXPLICIT_QTY_PATTERNS.iter() {
        if let Some(parsed_qty) = pattern
            .captures(&upper_title)
            .and_then(|caps| caps.get(1))
            .and_then(|m| m.as_str().parse::<i64>().ok())
        {
            if parsed_qty > 0 && parsed_qty < 500 {
                quantity = parsed_qty;
                break;
            }
        }
    }

    // Every "<num> GB|TB" mention in the title.
    let size_matches: Vec<(f64, String)> = SIZE_REGEX
        .captures_iter(&upper_title)
        .filter_map(|caps| {
            let val = caps.get(1)?.as_str().parse::<f64>().ok()?;
            let unit = caps.get(2)?.as_str().to_string();
            Some((val, unit))
        })
        .collect();

    let mut individual_size_gb = 0i64;
    if !size_matches.is_empty() {
        // Truncating f64 -> i64 is intentional (e.g. 1.92 TB -> 1966 GB).
        let mut unique_sizes_gb: Vec<i64> = size_matches
            .iter()
            .map(|(val, unit)| {
                if unit == "TB" {
                    (*val * 1024.0) as i64
                } else {
                    *val as i64
                }
            })
            .collect();
        // i64 has a total order; sort_unstable replaces the old
        // sort_by(partial_cmp) dance.
        unique_sizes_gb.sort_unstable();
        unique_sizes_gb.dedup();
        // Smallest mentioned size is treated as the per-drive capacity.
        individual_size_gb = unique_sizes_gb[0];
        if unique_sizes_gb.len() > 1 {
            needed_description_check = true;
        }
    }

    if SIZE_RANGE_REGEX.is_match(&upper_title) {
        needed_description_check = true;
    }
    if quantity > 1 && upper_title.contains("MIXED") {
        needed_description_check = true;
    }
    if upper_title.contains("CHECK THE DESCRIPTION")
        || upper_title.contains("CHECK DESCRIPTION")
        || upper_title.contains("SEE DESCRIPTION")
    {
        if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
            needed_description_check = true;
        }
    }

    let total_gb = if individual_size_gb > 0 {
        individual_size_gb * quantity
    } else {
        0
    };
    if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
        needed_description_check = true;
    }
    // (An empty `if quantity == 1 && …` branch with no effect was removed.)

    ParsedStorage {
        id: 0,
        item: item_id,
        total_gigabytes: total_gb,
        quantity,
        individual_size_gigabytes: individual_size_gb,
        needed_description_check,
        parse_engine: 0,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    // Each entry: (title, expected parse, whether the parser currently
    // produces the expected result — false marks a known failure that is
    // skipped by the test below).
    static TESTS: &[(&str, ParsedStorage, bool)] = &[
        (
            "Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD",
            ParsedStorage {
                id: 0,
                item: 0,
                total_gigabytes: 512 * 3,
                quantity: 3,
                individual_size_gigabytes: 512,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Samsung SM883 2.5” 240GB SATA 6Gbps MZ7KH240HAHQ-00005",
            ParsedStorage {
                id: 0,
                item: 1,
                total_gigabytes: 240,
                quantity: 1,
                individual_size_gigabytes: 240,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "1TB AData SU650 2.5-inch SATA 6Gb/s SSD Solid State Disk (READ)",
            ParsedStorage {
                id: 0,
                item: 2,
                total_gigabytes: 1024,
                quantity: 1,
                individual_size_gigabytes: 1024,
                parse_engine: 0,
                needed_description_check: true,
            },
            false, // Sadly this one fails :/
        ),
        (
            "Hitachi VSP 7TB Flash Module Drive (FMD) 3286734-A DKC-F810I-7R0FP",
            ParsedStorage {
                id: 0,
                item: 4,
                total_gigabytes: 7 * 1024,
                quantity: 1,
                individual_size_gigabytes: 7 * 1024,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)",
            ParsedStorage {
                id: 0,
                item: 5,
                total_gigabytes: 6 * 256,
                quantity: 6,
                individual_size_gigabytes: 256,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Lenovo-Micron 5300 Pro 1.92TB 2.5\" Sata SSD 02JG540 (MTFDDAK1T9TDS)",
            ParsedStorage {
                id: 0,
                item: 6,
                total_gigabytes: 1966,
                quantity: 1,
                individual_size_gigabytes: 1966,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
    ];

    /// Runs every TESTS entry whose third field marks it as expected-to-pass.
    #[test_log::test]
    fn parse() {
        for t in TESTS {
            if t.2 {
                assert_eq!(t.1, parse_size_and_quantity(t.1.item, t.0));
            }
        }
    }
}

74
src/xdg_dirs.rs Normal file
View File

@ -0,0 +1,74 @@
use std::path::PathBuf;
use tracing::{info, warn};
/// XDG base-directory categories (per the XDG Base Directory Specification).
/// Only XdgDataHome is implemented so far; the rest are placeholders.
enum XdgType {
    XdgDataHome,
    #[allow(dead_code)]
    XdgConfigHome,
    #[allow(dead_code)]
    XdgCacheHome,
    #[allow(dead_code)]
    XdgStateHome,
    #[allow(dead_code)]
    XdgRuntimeDir,
}
/// Resolves the XDG base directory of the requested type and returns the
/// per-app subdirectory inside it, creating the app directory if needed.
/// Panics when the base directory itself does not exist.
fn ensure_xdg_dir_exists(app_name: &str, xdg_type: XdgType) -> PathBuf {
    let dir = match xdg_type {
        XdgType::XdgDataHome => dirs::data_local_dir().unwrap_or_else(|| {
            // Fix: the XDG spec's default data dir is $HOME/.local/share —
            // this previously joined ".share", producing ~/.local/.share.
            let d = dirs::home_dir().unwrap().join(".local").join("share");
            warn!(
                "OS returned no data local dir (XDG_DATA_HOME), HOME is {:?}, using {:?}!",
                dirs::home_dir(),
                d
            );
            d
        }),
        XdgType::XdgConfigHome => todo!("Not yet tested/implimented"),
        XdgType::XdgCacheHome => todo!("Not yet tested/implimented"),
        XdgType::XdgStateHome => todo!("Not yet tested/implimented"),
        XdgType::XdgRuntimeDir => todo!("Not yet tested/implimented"),
    };
    if !std::fs::exists(&dir).unwrap() {
        panic!(
            "Base directory to use for this app does not exist at {:?}",
            dir
        );
    }
    let dir = dir.join(app_name);
    if !std::fs::exists(&dir).unwrap() {
        info!(
            "App directory to use for this app does not exist at {:?}, creating now.",
            dir
        );
        std::fs::create_dir(&dir).unwrap();
    }
    dir
}
/// Returns the directory scraped data lives in. With `override_path` given,
/// that directory must already exist and is used as-is; otherwise
/// `<XDG data dir>/<app_name>/raw_scraped` is used, created if missing.
pub fn ensure_scrapedata_dir_exists(app_name: &str, override_path: Option<PathBuf>) -> PathBuf {
    // Fix: replaced repeated `override_path.clone().unwrap()` with a single
    // `if let` binding; also removed the unreachable `unwrap_or` fallback
    // below (override_path is always None past this point) and fixed the
    // "eixst" typo in the panic message.
    if let Some(p) = override_path {
        if !std::fs::exists(&p).unwrap() {
            panic!("Override path of {:?} was given but does not exist, bailing.", p);
        }
        return p;
    }
    let app_dir = ensure_xdg_dir_exists(app_name, XdgType::XdgDataHome);
    let raw_scraped_dir = app_dir.join("raw_scraped");
    if !std::fs::exists(&raw_scraped_dir).unwrap() {
        info!(
            "scrape directory to use for this app does not exist at {:?}, creating now.",
            raw_scraped_dir
        );
        std::fs::create_dir(&raw_scraped_dir).unwrap();
    }
    raw_scraped_dir
}