Rust cleanup (separate files, idiomatic rust, etc)
ebay_storage/rust/Cargo.lock | 699 (generated; diff suppressed because it is too large)
ebay_storage/rust/Cargo.toml
@@ -1,19 +1,21 @@
 [package]
 name = "ebay_scraper_rust"
 version = "0.1.0"
-edition = "2021"
+edition = "2024"
 
 [dependencies]
-clap = { version = "4.4", features = ["derive"] }
-reqwest = { version = "0.11", features = ["json", "stream"] } # Removed "blocking" as we use tokio
-scraper = "0.18"
-serde = { version = "1.0", features = ["derive"] }
-serde_json = "1.0"
-regex = "1.10"
-tokio = { version = "1", features = ["full"] }
-url = "2.5"
+clap = { version = "4.5.39", features = ["derive"] }
+reqwest = { version = "0.12.18", features = ["json", "stream"] } # Removed "blocking" as we use tokio
+scraper = "0.23.1"
+serde = { version = "1.0.219", features = ["derive"] }
+serde_json = "1.0.140"
+regex = "1.11.1"
+tokio = { version = "1.45.1", features = ["full"] }
+url = "2.5.4"
 # path-slash is not strictly needed if using std::path::PathBuf correctly
-bytes = "1.5"
-chrono = { version = "0.4", features = ["serde"] }
-lazy_static = "1.4.0"
-futures = "0.3" # For join_all on async tasks
+bytes = "1.10.1"
+chrono = { version = "0.4.41", features = ["serde"] }
+lazy_static = "1.5.0"
+futures = "0.3.31" # For join_all on async tasks
+tracing = "0.1.41"
+tracing-subscriber = { version = "0.3.19", features = ["fmt"] }
ebay_storage/rust/src/cli.rs | 45 (new file)
@@ -0,0 +1,45 @@
+// src/cli.rs
+use clap::Parser;
+
+#[derive(Parser, Debug)]
+#[clap(
+    name = "ebay-scraper-rust",
+    version = "0.1.0",
+    about = "Scrapes eBay search results for SSD/HDD cost per TB."
+)]
+pub struct Cli {
+    #[clap(subcommand)]
+    pub command: Option<Commands>,
+
+    /// The full eBay search URL to scrape.
+    pub url: Option<String>,
+
+    /// Save scraped HTML to a file (and download images if fetching from URL).
+    #[clap(long)]
+    pub save: Option<String>,
+
+    /// Load HTML from a file (disables network). Image download will not occur with --load.
+    #[clap(long)]
+    pub load: Option<String>,
+
+    /// Suppress informational logs, output only final JSON.
+    #[clap(long)]
+    pub only_json: bool,
+}
+
+#[derive(Parser, Debug)]
+pub enum Commands {
+    /// Scrapes latest listings.
+    Latest(LatestArgs),
+}
+
+#[derive(Parser, Debug)]
+pub struct LatestArgs {
+    /// Items per page (60, 120, or 240)
+    #[clap(long, default_value = "60")]
+    pub per_page: String,
+
+    /// Minimum cost (e.g., 50.00)
+    #[clap(long, default_value = "0.00")]
+    pub minimum_cost: f64,
+}
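
A quick sanity check of how these definitions parse (a sketch, not part of the commit; the argument vector is made up, and it assumes clap 4's default kebab-case renaming, so only_json becomes --only-json):

// Sketch: exercising the Cli definition above via clap::Parser::try_parse_from.
#[cfg(test)]
mod cli_sketch {
    use super::{Cli, Commands};
    use clap::Parser;

    #[test]
    fn parses_latest_subcommand() {
        // The first element of the iterator is the binary name.
        let cli = Cli::try_parse_from([
            "ebay-scraper-rust", "--only-json", "latest", "--per-page", "120",
        ])
        .expect("arguments should parse");
        assert!(cli.only_json);
        match cli.command {
            Some(Commands::Latest(args)) => assert_eq!(args.per_page, "120"),
            _ => panic!("expected the Latest subcommand"),
        }
    }
}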
ebay_storage/rust/src/html_utils.rs | 250 (new file)
@@ -0,0 +1,250 @@
+// src/html_utils.rs
+use chrono::Utc;
+use lazy_static::lazy_static;
+use regex::Regex;
+use scraper::{Html, Selector};
+use std::error::Error as StdError; // Use the same alias as main
+use std::fs::{self, File};
+use std::io::Write;
+use std::path::Path;
+use tracing::{error, info, warn};
+use url::Url;
+
+use super::item::{EbayItem, ParsedItemData};
+use super::parser_utils;
+
+// Define or import AppError to match main.rs
+type AppError = Box<dyn StdError + Send + Sync + 'static>;
+
+const PARSER_ENGINE_VERSION: i32 = 1;
+const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36";
+
+lazy_static! {
+    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
+    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
+}
+
+/// Fetches HTML content from a URL.
+pub async fn fetch_html(url: &str) -> Result<String, AppError> {
+    info!(target_url = url, "Navigating to URL");
+    let client = reqwest::Client::builder().user_agent(USER_AGENT).build()?;
+    let response = client.get(url).send().await?;
+    if !response.status().is_success() {
+        let err_msg = format!(
+            "Failed to fetch URL: {} - Status: {}",
+            url,
+            response.status()
+        );
+        error!(error_message = %err_msg, "URL fetch failed");
+        return Err(err_msg.into());
+    }
+    let html_content = response.text().await?;
+    info!(
+        target_url = url,
+        "Navigation successful. Page content retrieved."
+    );
+    Ok(html_content)
+}
+
+/// Extracts item data from HTML content.
+pub fn extract_data_from_html(html_content: &str) -> Result<Vec<EbayItem>, AppError> {
+    let document = Html::parse_document(html_content);
+    let mut items = Vec::new();
+    let today = Utc::now();
+
+    // MODIFIED: Using .unwrap() for Selector::parse() calls as requested
+    let item_selector =
+        Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
+    let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
+    let price_selector = Selector::parse(".s-item__price").unwrap();
+    let image_selector =
+        Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
+            .unwrap();
+    let link_selector =
+        Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
+    let bid_count_selector = Selector::parse(".s-item__bid-count").unwrap();
+    let best_offer_selector =
+        Selector::parse(".s-item__purchase-options--bo, .s-item__best-offer").unwrap();
+    let secondary_info_selector =
+        Selector::parse(".s-item__subtitle, .s-item__secondary-text, .s-item__detail--secondary")
+            .unwrap();
+    let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();
+
+    for element in document.select(&item_selector) {
+        let raw_title_text = element
+            .select(&title_selector)
+            .next()
+            .map(|el| el.text().collect::<String>().trim().to_string());
+        let price_text = element
+            .select(&price_selector)
+            .next()
+            .map(|el| el.text().collect::<String>().trim().to_string());
+
+        let item_id = element
+            .select(&link_selector)
+            .next()
+            .and_then(|link_el| link_el.value().attr("href"))
+            .and_then(|href| ITEM_ID_REGEX.captures(href))
+            .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()));
+
+        if raw_title_text.is_none() || price_text.is_none() || item_id.is_none() {
+            warn!("Skipping item due to missing title, price, or item ID.");
+            continue;
+        }
+        let raw_title = raw_title_text.unwrap();
+        let price_text = price_text.unwrap();
+        let item_id = item_id.unwrap();
+
+        let cleaned_title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
+
+        let primary_display_price = parser_utils::parse_price(&price_text);
+
+        let mut current_bid_price: Option<f64> = None;
+        let mut final_buy_it_now_price: Option<f64> = None;
+        let mut has_best_offer = false;
+        let mut item_is_auction = false;
+
+        if let Some(bid_el) = element.select(&bid_count_selector).next() {
+            if bid_el
+                .text()
+                .collect::<String>()
+                .to_lowercase()
+                .contains("bid")
+            {
+                item_is_auction = true;
+            }
+        }
+
+        if element.select(&best_offer_selector).next().is_some() {
+            has_best_offer = true;
+        } else {
+            for el in element.select(&secondary_info_selector) {
+                if el
+                    .text()
+                    .collect::<String>()
+                    .to_lowercase()
+                    .contains("or best offer")
+                {
+                    has_best_offer = true;
+                    break;
+                }
+            }
+        }
+
+        if item_is_auction {
+            current_bid_price = primary_display_price;
+            if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
+                final_buy_it_now_price =
+                    parser_utils::parse_price(&bin_el.text().collect::<String>());
+            }
+        } else {
+            final_buy_it_now_price = primary_display_price;
+        }
+
+        let image_url_val = element
+            .select(&image_selector)
+            .next()
+            .and_then(|img_el| {
+                img_el
+                    .value()
+                    .attr("data-src")
+                    .or(img_el.value().attr("src"))
+            })
+            .map(|s| s.to_string());
+
+        let parsed_size_info = parser_utils::parse_size_and_quantity(&cleaned_title);
+
+        let cost_per_tb = if let Some(price) = primary_display_price {
+            if parsed_size_info.total_tb > 0.0 {
+                Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        let parsed_data = ParsedItemData {
+            item_count: parsed_size_info.quantity,
+            size_per_item_tb: if parsed_size_info.individual_size_tb > 0.0 {
+                Some(parsed_size_info.individual_size_tb)
+            } else {
+                None
+            },
+            total_tb: if parsed_size_info.total_tb > 0.0 {
+                Some(parsed_size_info.total_tb)
+            } else {
+                None
+            },
+            cost_per_tb,
+            needed_description_check: parsed_size_info.needed_description_check,
+            parser_engine: PARSER_ENGINE_VERSION,
+        };
+
+        items.push(EbayItem {
+            title: cleaned_title,
+            item_id,
+            date_found: today,
+            current_bid_price,
+            buy_it_now_price: final_buy_it_now_price,
+            has_best_offer,
+            image_url: image_url_val,
+            parsed: parsed_data,
+        });
+    }
+    Ok(items)
+}
+
+/// Downloads an image from a URL and saves it, preserving path structure.
+pub async fn download_image(
+    image_url_str: &str,
+    base_save_directory: &Path,
+) -> Result<(), AppError> {
+    if image_url_str.is_empty() {
+        return Ok(());
+    }
+
+    let parsed_url = Url::parse(image_url_str).map_err(|e| Box::new(e) as AppError)?;
+
+    let image_path_from_url = parsed_url.path().trim_start_matches('/');
+    if image_path_from_url.is_empty() {
+        return Err(Box::from("Image URL has no path component") as AppError);
+    }
+
+    let full_local_image_path = base_save_directory.join(image_path_from_url);
+
+    if let Some(parent_dir) = full_local_image_path.parent() {
+        fs::create_dir_all(parent_dir).map_err(|e| Box::new(e) as AppError)?;
+        info!(path = %parent_dir.display(), "Ensured image directory exists");
+    }
+
+    let client = reqwest::Client::builder()
+        .user_agent(USER_AGENT)
+        .build()
+        .map_err(|e| Box::new(e) as AppError)?;
+    let response = client
+        .get(image_url_str)
+        .send()
+        .await
+        .map_err(|e| Box::new(e) as AppError)?;
+
+    if !response.status().is_success() {
+        let err_msg = format!(
+            "Failed to download image {}. Status: {}",
+            image_url_str,
+            response.status()
+        );
+        return Err(Box::from(err_msg) as AppError);
+    }
+
+    let mut file = File::create(&full_local_image_path).map_err(|e| Box::new(e) as AppError)?;
+    let content = response
+        .bytes()
+        .await
+        .map_err(|e| Box::new(e) as AppError)?;
+    file.write_all(&content)
+        .map_err(|e| Box::new(e) as AppError)?;
+
+    info!(path = %full_local_image_path.display(), "Downloaded image");
+    Ok(())
+}
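
To see the selector pipeline above end to end, here is a minimal sketch (not part of the commit) that feeds a hand-written fragment through extract_data_from_html; the HTML is hypothetical, shaped only to satisfy the title, price, and link selectors:

// Sketch: a hypothetical listing fragment matching the selectors above.
#[cfg(test)]
mod extraction_sketch {
    #[test]
    fn extracts_a_minimal_listing() {
        let html = r#"
            <ul>
              <li class="s-item">
                <div class="s-item__title">LOT OF 2 4TB HDD</div>
                <span class="s-item__price">$100.00</span>
                <a class="s-item__link" href="https://www.ebay.com/itm/123456789012">link</a>
              </li>
            </ul>"#;
        let items = super::extract_data_from_html(html).expect("parsing should succeed");
        assert_eq!(items.len(), 1);
        assert_eq!(items[0].item_id, "123456789012");
        // "LOT OF 2" x 4TB => 8 TB total, so $100 / 8 TB = $12.50 per TB.
        assert_eq!(items[0].parsed.cost_per_tb, Some(12.5));
    }
}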
ebay_storage/rust/src/item.rs | 45 (new file)
@@ -0,0 +1,45 @@
+// src/item.rs
+use chrono::{DateTime, Utc};
+use serde::Serialize;
+
+#[derive(Serialize, Debug)]
+pub struct EbayItem {
+    pub title: String,
+    #[serde(rename = "itemId")]
+    pub item_id: String,
+    #[serde(rename = "dateFound")]
+    pub date_found: DateTime<Utc>,
+    #[serde(rename = "currentBidPrice")]
+    pub current_bid_price: Option<f64>,
+    #[serde(rename = "buyItNowPrice", skip_serializing_if = "Option::is_none")]
+    pub buy_it_now_price: Option<f64>,
+    #[serde(rename = "hasBestOffer")]
+    pub has_best_offer: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_url: Option<String>,
+    pub parsed: ParsedItemData,
+}
+
+#[derive(Serialize, Debug)]
+pub struct ParsedItemData {
+    #[serde(rename = "itemCount")]
+    pub item_count: i32,
+    #[serde(rename = "sizePerItemTB")]
+    pub size_per_item_tb: Option<f64>,
+    #[serde(rename = "totalTB")]
+    pub total_tb: Option<f64>,
+    #[serde(rename = "costPerTB")]
+    pub cost_per_tb: Option<f64>,
+    #[serde(rename = "needed_description_check")]
+    pub needed_description_check: bool,
+    #[serde(rename = "parser_engine")]
+    pub parser_engine: i32,
+}
+
+#[derive(Debug)]
+pub struct SizeQuantityInfo {
+    pub total_tb: f64,
+    pub quantity: i32,
+    pub individual_size_tb: f64,
+    pub needed_description_check: bool,
+}
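
For reference, the serde attributes above imply roughly this wire format (a sketch with made-up values; buyItNowPrice and image_url are omitted entirely when None):

// Illustrative output shape (hypothetical values, not from a real listing):
// {
//   "title": "4TB HDD",
//   "itemId": "123456789012",
//   "dateFound": "2025-01-01T00:00:00Z",
//   "currentBidPrice": null,
//   "hasBestOffer": false,
//   "parsed": {
//     "itemCount": 1,
//     "sizePerItemTB": 4.0,
//     "totalTB": 4.0,
//     "costPerTB": 25.0,
//     "needed_description_check": false,
//     "parser_engine": 1
//   }
// }
// Note the mixed convention: most fields are renamed to camelCase, while
// needed_description_check, parser_engine, and image_url stay snake_case.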
ebay_storage/rust/src/main.rs
@@ -1,426 +1,66 @@
-// main.rs
+// src/main.rs
+mod cli;
+mod item;
+mod parser_utils;
+mod html_utils;
 
-// Import necessary crates
 use clap::Parser;
-use regex::Regex;
-use scraper::{Html, Selector};
-use serde::Serialize;
-use std::fs::{self, File};
+use std::fs;
 use std::io::Write;
-use std::path::{Path, PathBuf};
-use std::error::Error;
-use chrono::{DateTime, Utc};
-use lazy_static::lazy_static;
-use url::Url;
+use std::path::PathBuf;
+use std::error::Error as StdError;
+use tracing::{info, error, warn, Level};
+use tracing_subscriber;
 
-// Define constants
-const PARSER_ENGINE_VERSION: i32 = 1;
-const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36";
+use cli::{Cli, Commands};
+use item::EbayItem;
 
-// --- Lazy static Regex definitions ---
-lazy_static! {
-    // Regex for parsing quantity from title (e.g., "LOT OF 10", "5-PACK")
-    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
-        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
-        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
-        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
-        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
-        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
-    ];
-    // Regex for parsing size from title (e.g., "500GB", "2TB")
-    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
-    // Regex for titles indicating a range of sizes or mixed items
-    static ref SIZE_RANGE_REGEX: Regex = Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)").unwrap();
-    // Regex for extracting item ID from URL
-    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
-    // Regex for parsing price, potentially a range
-    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
-    // Regex for "NEW LISTING" prefix - case-insensitive to better match JS /i flag
-    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
-}
+// Define a more specific error type for the application
+// This type is now implicitly used by html_utils.rs as well due to function signatures.
+type AppError = Box<dyn StdError + Send + Sync + 'static>;
 
-// --- Command Line Argument Parsing (using clap) ---
-#[derive(Parser, Debug)]
-#[clap(name = "ebay-scraper-rust", version = "0.1.0", about = "Scrapes eBay search results for SSD/HDD cost per TB.")]
-struct Cli {
-    #[clap(subcommand)]
-    command: Option<Commands>,
-
-    /// The full eBay search URL to scrape.
-    url: Option<String>,
-
-    /// Save scraped HTML to a file (and download images if fetching from URL).
-    #[clap(long)]
-    save: Option<String>,
-
-    /// Load HTML from a file (disables network). Image download will not occur with --load.
-    #[clap(long)]
-    load: Option<String>,
-
-    /// Suppress informational logs, output only final JSON.
-    #[clap(long)]
-    only_json: bool,
-}
-
-#[derive(Parser, Debug)]
-enum Commands {
-    /// Scrapes latest listings.
-    Latest(LatestArgs),
-}
-
-#[derive(Parser, Debug)]
-struct LatestArgs {
-    /// Items per page (60, 120, or 240)
-    #[clap(long, default_value = "60")]
-    per_page: String, // Keep as string for validation
-
-    /// Minimum cost (e.g., 50.00)
-    #[clap(long, default_value = "0.00")]
-    minimum_cost: f64,
-}
-
-// --- Data Structures for Scraped Items (using serde) ---
-#[derive(Serialize, Debug)]
-struct EbayItem {
-    title: String,
-    #[serde(rename = "itemId")]
-    item_id: String,
-    #[serde(rename = "dateFound")]
-    date_found: DateTime<Utc>,
-    #[serde(rename = "currentBidPrice")]
-    current_bid_price: Option<f64>,
-    #[serde(rename = "buyItNowPrice", skip_serializing_if = "Option::is_none")] // Keep skip for this one if JS does it
-    buy_it_now_price: Option<f64>,
-    #[serde(rename = "hasBestOffer")]
-    has_best_offer: bool,
-    #[serde(skip_serializing_if = "Option::is_none")] // Keep skip for this one if JS does it
-    image_url: Option<String>,
-    parsed: ParsedItemData,
-}
-
-#[derive(Serialize, Debug)]
-struct ParsedItemData {
-    #[serde(rename = "itemCount")]
-    item_count: i32,
-    // MODIFIED: Removed skip_serializing_if to always include the field, even if null
-    #[serde(rename = "sizePerItemTB")]
-    size_per_item_tb: Option<f64>,
-    #[serde(rename = "totalTB")]
-    total_tb: Option<f64>,
-    #[serde(rename = "costPerTB")]
-    cost_per_tb: Option<f64>,
-    #[serde(rename = "needed_description_check")]
-    needed_description_check: bool,
-    #[serde(rename = "parser_engine")]
-    parser_engine: i32,
-}
-
-#[derive(Debug)]
-struct SizeQuantityInfo {
-    total_tb: f64,
-    quantity: i32,
-    individual_size_tb: f64,
-    needed_description_check: bool,
-}
-
-// --- Logging ---
-fn log_message(message: &str, quiet_mode: bool) {
-    if !quiet_mode {
-        eprintln!("{}", message);
-    }
-}
-
-fn log_error(message: &str, quiet_mode: bool) {
-    if !quiet_mode {
-        eprintln!("ERROR: {}", message);
-    }
-}
-
-// --- Parsing Logic ---
-mod parser {
-    use super::*;
-
-    /// Parses size and quantity information from an item title.
-    pub fn parse_size_and_quantity(title: &str) -> SizeQuantityInfo {
-        let upper_title = title.to_uppercase();
-        let mut total_tb = 0.0;
-        let mut quantity = 1;
-        let mut needed_description_check = false;
-        let mut individual_size_tb = 0.0;
-
-        for pattern in EXPLICIT_QTY_PATTERNS.iter() {
-            if let Some(caps) = pattern.captures(&upper_title) {
-                if let Some(qty_match) = caps.get(1) {
-                    if let Ok(parsed_qty) = qty_match.as_str().parse::<i32>() {
-                        if parsed_qty > 0 && parsed_qty < 500 {
-                            quantity = parsed_qty;
-                            break;
-                        }
-                    }
-                }
-            }
-        }
-
-        let mut size_matches: Vec<(f64, String)> = Vec::new();
-        for caps in SIZE_REGEX.captures_iter(&upper_title) {
-            if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
-                if let Ok(val) = val_str.as_str().parse::<f64>() {
-                    size_matches.push((val, unit_str.as_str().to_string()));
-                }
-            }
-        }
-
-        if !size_matches.is_empty() {
-            let mut unique_sizes_tb: Vec<f64> = size_matches.iter()
-                .map(|(val, unit)| if unit == "GB" { *val / 1000.0 } else { *val })
-                .collect();
-            unique_sizes_tb.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
-            unique_sizes_tb.dedup();
-
-            if !unique_sizes_tb.is_empty() {
-                individual_size_tb = unique_sizes_tb[0];
-                if unique_sizes_tb.len() > 1 {
-                    needed_description_check = true;
-                }
-            }
-        }
-
-        if SIZE_RANGE_REGEX.is_match(&upper_title) {
-            needed_description_check = true;
-        }
-        if quantity > 1 && upper_title.contains("MIXED") {
-            needed_description_check = true;
-        }
-        if upper_title.contains("CHECK THE DESCRIPTION") || upper_title.contains("CHECK DESCRIPTION") || upper_title.contains("SEE DESCRIPTION") {
-            if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
-                needed_description_check = true;
-            }
-        }
-
-        if individual_size_tb > 0.0 {
-            total_tb = individual_size_tb * quantity as f64;
-        }
-
-        if quantity > 1 && total_tb == 0.0 && !size_matches.is_empty() {
-            needed_description_check = true;
-        }
-
-        if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
-            // This condition is implicitly handled
-        }
-
-        SizeQuantityInfo {
-            total_tb: (total_tb * 10000.0).round() / 10000.0,
-            quantity,
-            individual_size_tb: (individual_size_tb * 10000.0).round() / 10000.0,
-            needed_description_check,
-        }
-    }
-
-    /// Parses price from a string, taking the first price if it's a range.
-    pub fn parse_price(price_text: &str) -> Option<f64> {
-        let lower_price_text = price_text.to_lowercase();
-        if lower_price_text.contains(" to ") {
-            if let Some(first_part) = lower_price_text.split(" to ").next() {
-                if let Some(caps) = PRICE_REGEX.captures(first_part) {
-                    if let Some(price_match) = caps.get(1) {
-                        return price_match.as_str().replace(',', "").parse().ok();
-                    }
-                }
-            }
-            return None;
-        }
-
-        if let Some(caps) = PRICE_REGEX.captures(price_text) {
-            if let Some(price_match) = caps.get(1) {
-                return price_match.as_str().replace(',', "").parse().ok();
-            }
-        }
-        None
-    }
-}
-
-// --- HTML Scraping Logic ---
-mod html_scraper {
-    use super::*;
-
-    /// Extracts item data from HTML content.
-    pub fn extract_data_from_html(html_content: &str, quiet_mode: bool) -> Result<Vec<EbayItem>, Box<dyn Error>> {
-        let document = Html::parse_document(html_content);
-        let mut items = Vec::new();
-        let today = Utc::now();
-
-        let item_selector = Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
-        let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
-        let price_selector = Selector::parse(".s-item__price").unwrap();
-        let image_selector = Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img").unwrap();
-        let link_selector = Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
-        let bid_count_selector = Selector::parse(".s-item__bid-count").unwrap();
-        let best_offer_selector = Selector::parse(".s-item__purchase-options--bo, .s-item__best-offer").unwrap();
-        let secondary_info_selector = Selector::parse(".s-item__subtitle, .s-item__secondary-text, .s-item__detail--secondary").unwrap();
-        let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();
-
-        for element in document.select(&item_selector) {
-            let raw_title_text = element.select(&title_selector).next().map(|el| el.text().collect::<String>().trim().to_string());
-            let price_text = element.select(&price_selector).next().map(|el| el.text().collect::<String>().trim().to_string());
-
-            let item_id = element.select(&link_selector).next()
-                .and_then(|link_el| link_el.value().attr("href"))
-                .and_then(|href| ITEM_ID_REGEX.captures(href))
-                .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()));
-
-            if raw_title_text.is_none() || price_text.is_none() || item_id.is_none() {
-                log_message("Skipping item due to missing title, price, or item ID.", quiet_mode);
-                continue;
-            }
-            let raw_title = raw_title_text.unwrap();
-            let price_text = price_text.unwrap();
-            let item_id = item_id.unwrap();
-
-            let cleaned_title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
-
-            let primary_display_price = parser::parse_price(&price_text);
-
-            let mut current_bid_price: Option<f64> = None;
-            let mut final_buy_it_now_price: Option<f64> = None;
-            let mut has_best_offer = false;
-            let mut item_is_auction = false;
-
-            if let Some(bid_el) = element.select(&bid_count_selector).next() {
-                if bid_el.text().collect::<String>().to_lowercase().contains("bid") {
-                    item_is_auction = true;
-                }
-            }
-
-            if element.select(&best_offer_selector).next().is_some() {
-                has_best_offer = true;
-            } else {
-                for el in element.select(&secondary_info_selector) {
-                    if el.text().collect::<String>().to_lowercase().contains("or best offer") {
-                        has_best_offer = true;
-                        break;
-                    }
-                }
-            }
-
-            if item_is_auction {
-                current_bid_price = primary_display_price;
-                if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
-                    final_buy_it_now_price = parser::parse_price(&bin_el.text().collect::<String>());
-                }
-            } else {
-                final_buy_it_now_price = primary_display_price;
-            }
-
-            let image_url_val = element.select(&image_selector).next()
-                .and_then(|img_el| {
-                    img_el.value().attr("data-src").or(img_el.value().attr("src"))
-                })
-                .map(|s| s.to_string());
-
-            let parsed_size_info = parser::parse_size_and_quantity(&cleaned_title);
-
-            let cost_per_tb = if let Some(price) = primary_display_price {
-                if parsed_size_info.total_tb > 0.0 {
-                    Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
-                } else { None }
-            } else { None };
-
-            let parsed_data = ParsedItemData {
-                item_count: parsed_size_info.quantity,
-                size_per_item_tb: if parsed_size_info.individual_size_tb > 0.0 { Some(parsed_size_info.individual_size_tb) } else { None },
-                total_tb: if parsed_size_info.total_tb > 0.0 { Some(parsed_size_info.total_tb) } else { None },
-                cost_per_tb, // This will be None if conditions aren't met, and serialized as null
-                needed_description_check: parsed_size_info.needed_description_check,
-                parser_engine: PARSER_ENGINE_VERSION,
-            };
-
-            items.push(EbayItem {
-                title: cleaned_title,
-                item_id,
-                date_found: today,
-                current_bid_price,
-                buy_it_now_price: final_buy_it_now_price,
-                has_best_offer,
-                image_url: image_url_val,
-                parsed: parsed_data,
-            });
-        }
-        Ok(items)
-    }
-
-    /// Downloads an image from a URL and saves it, preserving path structure.
-    pub async fn download_image(image_url_str: &str, base_save_directory: &Path, quiet_mode: bool) -> Result<(), Box<dyn Error>> {
-        if image_url_str.is_empty() {
-            return Ok(());
-        }
-
-        let parsed_url = Url::parse(image_url_str)?;
-
-        let image_path_from_url = parsed_url.path().trim_start_matches('/');
-        if image_path_from_url.is_empty() {
-            return Err("Image URL has no path component".into());
-        }
-
-        let full_local_image_path = base_save_directory.join(image_path_from_url);
-
-        if let Some(parent_dir) = full_local_image_path.parent() {
-            fs::create_dir_all(parent_dir)?;
-            log_message(&format!("Ensured image directory exists: {}", parent_dir.display()), quiet_mode);
-        }
-
-        let client = reqwest::Client::builder().user_agent(USER_AGENT).build()?;
-        let response = client.get(image_url_str).send().await?;
-
-        if !response.status().is_success() {
-            return Err(format!("Failed to download image {}. Status: {}", image_url_str, response.status()).into());
-        }
-
-        let mut file = File::create(&full_local_image_path)?;
-        let content = response.bytes().await?;
-        file.write_all(&content)?;
-
-        log_message(&format!("Downloaded image: {}", full_local_image_path.display()), quiet_mode);
-        Ok(())
-    }
-}
-
-// --- Main Application Logic ---
 #[tokio::main]
-async fn main() -> Result<(), Box<dyn Error>> {
-    let cli = Cli::parse();
-    let quiet_mode = cli.only_json;
-
-    log_message("Starting scraping process...", quiet_mode);
+async fn main() -> Result<(), AppError> {
+    let cli_args = Cli::parse();
+
+    let subscriber_builder = tracing_subscriber::fmt().with_writer(std::io::stderr);
+    if cli_args.only_json {
+        subscriber_builder
+            .with_max_level(Level::ERROR)
+            .try_init()?;
+    } else {
+        subscriber_builder
+            .with_max_level(Level::INFO)
+            .try_init()?;
+    }
+
+    info!("Starting scraping process...");
 
     let html_content_to_parse: String;
     let mut should_download_images = false;
     let mut image_base_save_dir: Option<PathBuf> = None;
 
-    if let Some(html_file) = &cli.load {
-        log_message(&format!("Loading HTML from {}...", html_file), quiet_mode);
-        html_content_to_parse = fs::read_to_string(html_file)?;
-        log_message("HTML loaded. Network requests for page content disabled.", quiet_mode);
+    if let Some(html_file) = &cli_args.load {
+        info!(file_path = %html_file, "Loading HTML from file");
+        html_content_to_parse = fs::read_to_string(html_file)?; // std::io::Error converts to AppError via ?
+        info!("HTML loaded. Network requests for page content disabled.");
     } else {
-        let url_to_fetch = match (&cli.command, &cli.url) {
+        let url_to_fetch = match (&cli_args.command, &cli_args.url) {
             (Some(Commands::Latest(latest_args)), _) => {
                 let valid_per_page = ["60", "120", "240"];
                 if !valid_per_page.contains(&latest_args.per_page.as_str()) {
                     let err_msg = format!("--per_page must be one of {}, got {}", valid_per_page.join(", "), latest_args.per_page);
-                    log_error(&err_msg, quiet_mode);
-                    return Err(err_msg.into());
+                    error!(error_message = %err_msg, "Invalid per_page argument");
+                    return Err(err_msg.into()); // String converts to AppError
                 }
                 if latest_args.minimum_cost < 0.0 {
                     let err_msg = "--minimum_cost must be a non-negative number.";
-                    log_error(err_msg, quiet_mode);
-                    return Err(err_msg.into());
+                    error!(error_message = %err_msg, "Invalid minimum_cost argument");
+                    return Err(err_msg.into()); // String converts to AppError
                 }
                 let base_url = "https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10";
                 let url = format!("{}&_ipg={}&_udlo={:.2}", base_url, latest_args.per_page, latest_args.minimum_cost);
-                log_message(&format!("Constructed URL for 'latest': {}", url), quiet_mode);
+                info!(constructed_url = %url, "Constructed URL for 'latest' command");
                 url
             }
             (None, Some(url_arg)) => {
@@ -428,27 +68,18 @@ async fn main() -> Result<(), Box<dyn Error>> {
             }
             (None, None) => {
                 let err_msg = "No URL provided and no command specified. Use --help for usage.";
-                log_error(err_msg, true);
-                return Err(err_msg.into());
+                eprintln!("ERROR: {}", err_msg);
+                return Err(err_msg.into()); // String converts to AppError
             }
         };
 
-        log_message(&format!("Navigating to {}...", url_to_fetch), quiet_mode);
-        let client = reqwest::Client::builder().user_agent(USER_AGENT).build()?;
-        let response = client.get(&url_to_fetch).send().await?;
-        if !response.status().is_success() {
-            let err_msg = format!("Failed to fetch URL: {} - Status: {}", url_to_fetch, response.status());
-            log_error(&err_msg, quiet_mode);
-            return Err(err_msg.into());
-        }
-        html_content_to_parse = response.text().await?;
-        log_message("Navigation successful. Page content retrieved.", quiet_mode);
+        html_content_to_parse = html_utils::fetch_html(&url_to_fetch).await?; // This now returns Result<String, AppError>
 
-        if let Some(save_path_str) = &cli.save {
-            log_message(&format!("Saving HTML to {}...", save_path_str), quiet_mode);
-            let mut file = File::create(save_path_str)?;
-            file.write_all(html_content_to_parse.as_bytes())?;
-            log_message("HTML saved.", quiet_mode);
+        if let Some(save_path_str) = &cli_args.save {
+            info!(file_path = %save_path_str, "Saving HTML to file");
+            let mut file = fs::File::create(save_path_str)?; // std::io::Error converts
+            file.write_all(html_content_to_parse.as_bytes())?; // std::io::Error converts
+            info!("HTML saved.");
 
             should_download_images = true;
             let save_file_path = PathBuf::from(save_path_str);
@@ -461,13 +92,13 @@ async fn main() -> Result<(), Box<dyn Error>> {
        }
     }
 
-    log_message("Extracting data...", quiet_mode);
-    let extracted_results = html_scraper::extract_data_from_html(&html_content_to_parse, quiet_mode)?;
-    log_message(&format!("Data extraction complete. Found {} items.", extracted_results.len()), quiet_mode);
+    info!("Extracting data...");
+    let extracted_results: Vec<EbayItem> = html_utils::extract_data_from_html(&html_content_to_parse)?; // This now returns Result<Vec<EbayItem>, AppError>
+    info!(item_count = extracted_results.len(), "Data extraction complete.");
 
     if should_download_images && !extracted_results.is_empty() {
         if let Some(img_base_dir) = image_base_save_dir {
-            log_message(&format!("Downloading images into subdirectories of {}...", img_base_dir.display()), quiet_mode);
+            info!(directory = %img_base_dir.display(), "Downloading images");
 
             let mut download_futures = Vec::new();
             for item in &extracted_results {
@@ -477,31 +108,30 @@ async fn main() -> Result<(), Box<dyn Error>> {
                let item_id_clone = item.item_id.clone();
 
                download_futures.push(async move {
-                    if let Err(e) = html_scraper::download_image(&img_url_clone, &img_base_dir_clone, quiet_mode).await {
-                        log_error(&format!("Skipping image download for item ID {} (URL: {}) due to error: {}", item_id_clone, img_url_clone, e), quiet_mode);
+                    if let Err(e) = html_utils::download_image(&img_url_clone, &img_base_dir_clone).await { // This now returns Result<(), AppError>
+                        warn!(item_id = %item_id_clone, image_url = %img_url_clone, error = %e, "Skipping image download due to error");
                     }
                });
            }
        }
        futures::future::join_all(download_futures).await;
-        log_message("Image download process finished.", quiet_mode);
+        info!("Image download process finished.");
        }
    }
 
-    if quiet_mode {
-        println!("{}", serde_json::to_string(&extracted_results)?);
+    if cli_args.only_json {
+        println!("{}", serde_json::to_string(&extracted_results)?); // serde_json::Error converts
     } else {
-        println!("{}", serde_json::to_string_pretty(&extracted_results)?);
+        println!("{}", serde_json::to_string_pretty(&extracted_results)?); // serde_json::Error converts
     }
 
     Ok(())
 }
 
-// --- Unit tests for parser functions (optional, but good practice) ---
 #[cfg(test)]
 mod tests {
-    use super::parser::*;
-    use super::SizeQuantityInfo;
+    use super::parser_utils::*;
+    use super::item::SizeQuantityInfo;
 
     fn assert_sq_info_eq(actual: SizeQuantityInfo, expected_total_tb: f64, expected_quantity: i32, expected_ind_size_tb: f64, expected_check: bool) {
         assert!((actual.total_tb - expected_total_tb).abs() < 0.0001, "TotalTB mismatch. Expected: {}, Got: {}", expected_total_tb, actual.total_tb);
@@ -527,7 +157,7 @@ mod tests {
        ];
 
        for (title, total_tb, quantity, ind_size_tb, check) in test_cases {
-            println!("Testing title: {}", title);
+            tracing::debug!(testing_title = %title, "Running test_parse_size_and_quantity");
            let result = parse_size_and_quantity(title);
            assert_sq_info_eq(result, total_tb, quantity, ind_size_tb, check);
        }
@@ -544,4 +174,3 @@ mod tests {
        assert_eq!(parse_price("25.50"), Some(25.50));
    }
 }
-
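
For orientation, typical invocations of the refactored binary might look like the following (a sketch, not from the commit; flag names assume clap's default kebab-case renaming):

// Hypothetical usage:
//   ebay_scraper_rust "https://www.ebay.com/sch/i.html?_nkw=ssd" --save page.html
//       fetch a search page, save the HTML, and download listing images
//   ebay_scraper_rust --load page.html --only-json
//       re-parse a saved page offline and print compact JSON to stdout
//   ebay_scraper_rust latest --per-page 120 --minimum-cost 50.00
//       scrape the newest listings via the constructed search URL
// Logs go to stderr (tracing_subscriber is built with std::io::stderr as its
// writer), so stdout stays clean JSON for piping.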
ebay_storage/rust/src/parser_utils.rs | 122 (new file)
@@ -0,0 +1,122 @@
+// src/parser_utils.rs
+use super::item::SizeQuantityInfo; // Assuming item.rs is in the same directory (src)
+use lazy_static::lazy_static;
+use regex::Regex;
+
+lazy_static! {
+    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
+        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
+        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
+        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
+        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
+        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
+    ];
+    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
+    static ref SIZE_RANGE_REGEX: Regex =
+        Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)")
+            .unwrap();
+    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
+}
+
+/// Parses size and quantity information from an item title.
+pub fn parse_size_and_quantity(title: &str) -> SizeQuantityInfo {
+    let upper_title = title.to_uppercase();
+    let mut total_tb = 0.0;
+    let mut quantity = 1;
+    let mut needed_description_check = false;
+    let mut individual_size_tb = 0.0;
+
+    for pattern in EXPLICIT_QTY_PATTERNS.iter() {
+        if let Some(caps) = pattern.captures(&upper_title) {
+            if let Some(qty_match) = caps.get(1) {
+                if let Ok(parsed_qty) = qty_match.as_str().parse::<i32>() {
+                    if parsed_qty > 0 && parsed_qty < 500 {
+                        quantity = parsed_qty;
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    let mut size_matches: Vec<(f64, String)> = Vec::new();
+    for caps in SIZE_REGEX.captures_iter(&upper_title) {
+        if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
+            if let Ok(val) = val_str.as_str().parse::<f64>() {
+                size_matches.push((val, unit_str.as_str().to_string()));
+            }
+        }
+    }
+
+    if !size_matches.is_empty() {
+        let mut unique_sizes_tb: Vec<f64> = size_matches
+            .iter()
+            .map(|(val, unit)| if unit == "GB" { *val / 1000.0 } else { *val })
+            .collect();
+        unique_sizes_tb.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+        unique_sizes_tb.dedup();
+
+        if !unique_sizes_tb.is_empty() {
+            individual_size_tb = unique_sizes_tb[0];
+            if unique_sizes_tb.len() > 1 {
+                needed_description_check = true;
+            }
+        }
+    }
+
+    if SIZE_RANGE_REGEX.is_match(&upper_title) {
+        needed_description_check = true;
+    }
+    if quantity > 1 && upper_title.contains("MIXED") {
+        needed_description_check = true;
+    }
+    if upper_title.contains("CHECK THE DESCRIPTION")
+        || upper_title.contains("CHECK DESCRIPTION")
+        || upper_title.contains("SEE DESCRIPTION")
+    {
+        if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
+            needed_description_check = true;
+        }
+    }
+
+    if individual_size_tb > 0.0 {
+        total_tb = individual_size_tb * quantity as f64;
+    }
+
+    if quantity > 1 && total_tb == 0.0 && !size_matches.is_empty() {
+        needed_description_check = true;
+    }
+
+    if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
+        // This condition is implicitly handled
+    }
+
+    SizeQuantityInfo {
+        total_tb: (total_tb * 10000.0).round() / 10000.0,
+        quantity,
+        individual_size_tb: (individual_size_tb * 10000.0).round() / 10000.0,
+        needed_description_check,
+    }
+}
+
+/// Parses price from a string, taking the first price if it's a range.
+pub fn parse_price(price_text: &str) -> Option<f64> {
+    let lower_price_text = price_text.to_lowercase();
+    if lower_price_text.contains(" to ") {
+        if let Some(first_part) = lower_price_text.split(" to ").next() {
+            if let Some(caps) = PRICE_REGEX.captures(first_part) {
+                if let Some(price_match) = caps.get(1) {
+                    return price_match.as_str().replace(',', "").parse().ok();
+                }
+            }
+        }
+        return None;
+    }
+
+    if let Some(caps) = PRICE_REGEX.captures(price_text) {
+        if let Some(price_match) = caps.get(1) {
+            return price_match.as_str().replace(',', "").parse().ok();
+        }
+    }
+    None
+}