// main.rs

// Import necessary crates
use clap::{Args, Parser, Subcommand};
use regex::Regex;
use scraper::{Html, Selector};
use serde::Serialize;
use std::fs::{self, File};
use std::io::Write;
use std::path::{Path, PathBuf};
use std::error::Error;
use chrono::{DateTime, Utc};
use lazy_static::lazy_static;
use url::Url;

// Define constants
const PARSER_ENGINE_VERSION: i32 = 1;
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36";

// --- Lazy static Regex definitions ---
lazy_static! {
    // Regex for parsing quantity from title (e.g., "LOT OF 10", "5-PACK")
    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
    ];
    // Regex for parsing size from title (e.g., "500GB", "2TB")
    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
    // Regex for titles indicating a range of sizes or mixed items
    static ref SIZE_RANGE_REGEX: Regex =
        Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)").unwrap();
    // Regex for extracting item ID from URL
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
    // Regex for parsing price, potentially a range
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
    // Regex for "NEW LISTING" prefix - case-insensitive to better match JS /i flag
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}

// --- Command Line Argument Parsing (using clap) ---
#[derive(Parser, Debug)]
#[clap(name = "ebay-scraper-rust", version = "0.1.0", about = "Scrapes eBay search results for SSD/HDD cost per TB.")]
struct Cli {
    #[clap(subcommand)]
    command: Option<Commands>,

    /// The full eBay search URL to scrape.
    url: Option<String>,

    /// Save scraped HTML to a file (and download images if fetching from URL).
    #[clap(long)]
    save: Option<String>,

    /// Load HTML from a file (disables network). Image download will not occur with --load.
    #[clap(long)]
    load: Option<String>,

    /// Suppress informational logs, output only final JSON.
    #[clap(long)]
    only_json: bool,
}

#[derive(Subcommand, Debug)]
enum Commands {
    /// Scrapes latest listings.
    Latest(LatestArgs),
}

#[derive(Args, Debug)]
struct LatestArgs {
    /// Items per page (60, 120, or 240)
    #[clap(long, default_value = "60")]
    per_page: String, // Keep as string for validation

    /// Minimum cost (e.g., 50.00)
    #[clap(long, default_value = "0.00")]
    minimum_cost: f64,
}

// --- Data Structures for Scraped Items (using serde) ---
#[derive(Serialize, Debug)]
struct EbayItem {
    title: String,
    #[serde(rename = "itemId")]
    item_id: String,
    #[serde(rename = "dateFound")]
    date_found: DateTime<Utc>,
    #[serde(rename = "currentBidPrice")]
    current_bid_price: Option<f64>,
    #[serde(rename = "buyItNowPrice", skip_serializing_if = "Option::is_none")] // Keep skip for this one if JS does it
    buy_it_now_price: Option<f64>,
    #[serde(rename = "hasBestOffer")]
    has_best_offer: bool,
    #[serde(skip_serializing_if = "Option::is_none")] // Keep skip for this one if JS does it
    image_url: Option<String>,
    parsed: ParsedItemData,
}

#[derive(Serialize, Debug)]
struct ParsedItemData {
    #[serde(rename = "itemCount")]
    item_count: i32,
    // MODIFIED: Removed skip_serializing_if to always include the field, even if null
    #[serde(rename = "sizePerItemTB")]
    size_per_item_tb: Option<f64>,
    #[serde(rename = "totalTB")]
    total_tb: Option<f64>,
    #[serde(rename = "costPerTB")]
    cost_per_tb: Option<f64>,
    #[serde(rename = "needed_description_check")]
    needed_description_check: bool,
    #[serde(rename = "parser_engine")]
    parser_engine: i32,
}

#[derive(Debug)]
struct SizeQuantityInfo {
    total_tb: f64,
    quantity: i32,
    individual_size_tb: f64,
    needed_description_check: bool,
}

// --- Logging ---
fn log_message(message: &str, quiet_mode: bool) {
    if !quiet_mode {
        eprintln!("{}", message);
    }
}

fn log_error(message: &str, quiet_mode: bool) {
    if !quiet_mode {
        eprintln!("ERROR: {}", message);
    }
}

// --- Parsing Logic ---
mod parser {
    use super::*;

    /// Parses size and quantity information from an item title.
    pub fn parse_size_and_quantity(title: &str) -> SizeQuantityInfo {
        let upper_title = title.to_uppercase();
        let mut total_tb = 0.0;
        let mut quantity = 1;
        let mut needed_description_check = false;
        let mut individual_size_tb = 0.0;

        // Look for an explicit quantity marker such as "LOT OF 10" or "5-PACK"
        for pattern in EXPLICIT_QTY_PATTERNS.iter() {
            if let Some(caps) = pattern.captures(&upper_title) {
                if let Some(qty_match) = caps.get(1) {
                    if let Ok(parsed_qty) = qty_match.as_str().parse::<i32>() {
                        if parsed_qty > 0 && parsed_qty < 500 {
                            quantity = parsed_qty;
                            break;
                        }
                    }
                }
            }
        }

        // Collect every size mention (value, unit) in the title
        let mut size_matches: Vec<(f64, String)> = Vec::new();
        for caps in SIZE_REGEX.captures_iter(&upper_title) {
            if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
                if let Ok(val) = val_str.as_str().parse::<f64>() {
                    size_matches.push((val, unit_str.as_str().to_string()));
                }
            }
        }

        if !size_matches.is_empty() {
            let mut unique_sizes_tb: Vec<f64> = size_matches.iter()
                .map(|(val, unit)| if unit == "GB" { *val / 1000.0 } else { *val })
                .collect();
            unique_sizes_tb.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
            unique_sizes_tb.dedup();
            if !unique_sizes_tb.is_empty() {
                individual_size_tb = unique_sizes_tb[0];
                if unique_sizes_tb.len() > 1 {
                    needed_description_check = true;
                }
            }
        }

        if SIZE_RANGE_REGEX.is_match(&upper_title) {
            needed_description_check = true;
        }
        if quantity > 1 && upper_title.contains("MIXED") {
            needed_description_check = true;
        }
        if upper_title.contains("CHECK THE DESCRIPTION")
            || upper_title.contains("CHECK DESCRIPTION")
            || upper_title.contains("SEE DESCRIPTION")
        {
            if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
                needed_description_check = true;
            }
        }

        if individual_size_tb > 0.0 {
            total_tb = individual_size_tb * quantity as f64;
        }
        if quantity > 1 && total_tb == 0.0 && !size_matches.is_empty() {
            needed_description_check = true;
        }
        if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
            // This condition is implicitly handled
        }

        SizeQuantityInfo {
            total_tb: (total_tb * 10000.0).round() / 10000.0,
            quantity,
            individual_size_tb: (individual_size_tb * 10000.0).round() / 10000.0,
            needed_description_check,
        }
    }

    /// Parses price from a string, taking the first price if it's a range.
    pub fn parse_price(price_text: &str) -> Option<f64> {
        let lower_price_text = price_text.to_lowercase();
        if lower_price_text.contains(" to ") {
            if let Some(first_part) = lower_price_text.split(" to ").next() {
                if let Some(caps) = PRICE_REGEX.captures(first_part) {
                    if let Some(price_match) = caps.get(1) {
                        return price_match.as_str().replace(',', "").parse().ok();
                    }
                }
            }
            return None;
        }
        if let Some(caps) = PRICE_REGEX.captures(price_text) {
            if let Some(price_match) = caps.get(1) {
                return price_match.as_str().replace(',', "").parse().ok();
            }
        }
        None
    }
}

// --- HTML Scraping Logic ---
mod html_scraper {
    use super::*;

    /// Extracts item data from HTML content.
    pub fn extract_data_from_html(html_content: &str, quiet_mode: bool) -> Result<Vec<EbayItem>, Box<dyn Error>> {
        let document = Html::parse_document(html_content);
        let mut items = Vec::new();
        let today = Utc::now();

        let item_selector = Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
        let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
        let price_selector = Selector::parse(".s-item__price").unwrap();
        let image_selector = Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img").unwrap();
        let link_selector = Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
        let bid_count_selector = Selector::parse(".s-item__bid-count").unwrap();
        let best_offer_selector = Selector::parse(".s-item__purchase-options--bo, .s-item__best-offer").unwrap();
        let secondary_info_selector = Selector::parse(".s-item__subtitle, .s-item__secondary-text, .s-item__detail--secondary").unwrap();
        let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();

        for element in document.select(&item_selector) {
            let raw_title_text = element.select(&title_selector).next()
                .map(|el| el.text().collect::<String>().trim().to_string());
            let price_text = element.select(&price_selector).next()
                .map(|el| el.text().collect::<String>().trim().to_string());
            let item_id = element.select(&link_selector).next()
                .and_then(|link_el| link_el.value().attr("href"))
                .and_then(|href| ITEM_ID_REGEX.captures(href))
                .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()));

            if raw_title_text.is_none() || price_text.is_none() || item_id.is_none() {
                log_message("Skipping item due to missing title, price, or item ID.", quiet_mode);
                continue;
            }
            let raw_title = raw_title_text.unwrap();
            let price_text = price_text.unwrap();
            let item_id = item_id.unwrap();

            let cleaned_title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
            let primary_display_price = parser::parse_price(&price_text);

            let mut current_bid_price: Option<f64> = None;
            let mut final_buy_it_now_price: Option<f64> = None;
            let mut has_best_offer = false;
            let mut item_is_auction = false;

            if let Some(bid_el) = element.select(&bid_count_selector).next() {
                if bid_el.text().collect::<String>().to_lowercase().contains("bid") {
                    item_is_auction = true;
                }
            }
            if element.select(&best_offer_selector).next().is_some() {
                has_best_offer = true;
            } else {
                for el in element.select(&secondary_info_selector) {
                    if el.text().collect::<String>().to_lowercase().contains("or best offer") {
                        has_best_offer = true;
                        break;
                    }
                }
            }

            if item_is_auction {
                current_bid_price = primary_display_price;
                if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
                    final_buy_it_now_price = parser::parse_price(&bin_el.text().collect::<String>());
                }
            } else {
                final_buy_it_now_price = primary_display_price;
            }

            let image_url_val = element.select(&image_selector).next()
                .and_then(|img_el| img_el.value().attr("data-src").or(img_el.value().attr("src")))
                .map(|s| s.to_string());

            let parsed_size_info = parser::parse_size_and_quantity(&cleaned_title);
            let cost_per_tb = if let Some(price) = primary_display_price {
                if parsed_size_info.total_tb > 0.0 {
                    Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
                } else {
                    None
                }
            } else {
                None
            };

            let parsed_data = ParsedItemData {
                item_count: parsed_size_info.quantity,
                size_per_item_tb: if parsed_size_info.individual_size_tb > 0.0 { Some(parsed_size_info.individual_size_tb) } else { None },
                total_tb: if parsed_size_info.total_tb > 0.0 { Some(parsed_size_info.total_tb) } else { None },
                cost_per_tb, // This will be None if conditions aren't met, and serialized as null
                needed_description_check: parsed_size_info.needed_description_check,
                parser_engine: PARSER_ENGINE_VERSION,
            };

            items.push(EbayItem {
                title: cleaned_title,
                item_id,
                date_found: today,
                current_bid_price,
                buy_it_now_price: final_buy_it_now_price,
                has_best_offer,
                image_url: image_url_val,
                parsed: parsed_data,
            });
        }
        Ok(items)
    }

    /// Downloads an image from a URL and saves it, preserving path structure.
    pub async fn download_image(image_url_str: &str, base_save_directory: &Path, quiet_mode: bool) -> Result<(), Box<dyn Error>> {
        if image_url_str.is_empty() {
            return Ok(());
        }
        let parsed_url = Url::parse(image_url_str)?;
        let image_path_from_url = parsed_url.path().trim_start_matches('/');
        if image_path_from_url.is_empty() {
            return Err("Image URL has no path component".into());
        }
        let full_local_image_path = base_save_directory.join(image_path_from_url);
        if let Some(parent_dir) = full_local_image_path.parent() {
            fs::create_dir_all(parent_dir)?;
            log_message(&format!("Ensured image directory exists: {}", parent_dir.display()), quiet_mode);
        }

        let client = reqwest::Client::builder().user_agent(USER_AGENT).build()?;
        let response = client.get(image_url_str).send().await?;
        if !response.status().is_success() {
            return Err(format!("Failed to download image {}. Status: {}", image_url_str, response.status()).into());
        }
        let mut file = File::create(&full_local_image_path)?;
        let content = response.bytes().await?;
        file.write_all(&content)?;
        log_message(&format!("Downloaded image: {}", full_local_image_path.display()), quiet_mode);
        Ok(())
    }
}

// --- Main Application Logic ---
#[tokio::main]
async fn main() -> Result<(), Box<dyn Error>> {
    let cli = Cli::parse();
    let quiet_mode = cli.only_json;
    log_message("Starting scraping process...", quiet_mode);

    let html_content_to_parse: String;
    let mut should_download_images = false;
    let mut image_base_save_dir: Option<PathBuf> = None;

    if let Some(html_file) = &cli.load {
        log_message(&format!("Loading HTML from {}...", html_file), quiet_mode);
        html_content_to_parse = fs::read_to_string(html_file)?;
        log_message("HTML loaded. Network requests for page content disabled.", quiet_mode);
    } else {
        let url_to_fetch = match (&cli.command, &cli.url) {
            (Some(Commands::Latest(latest_args)), _) => {
                let valid_per_page = ["60", "120", "240"];
                if !valid_per_page.contains(&latest_args.per_page.as_str()) {
                    let err_msg = format!("--per_page must be one of {}, got {}", valid_per_page.join(", "), latest_args.per_page);
                    log_error(&err_msg, quiet_mode);
                    return Err(err_msg.into());
                }
                if latest_args.minimum_cost < 0.0 {
                    let err_msg = "--minimum_cost must be a non-negative number.";
                    log_error(err_msg, quiet_mode);
                    return Err(err_msg.into());
                }
                let base_url = "https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10";
                let url = format!("{}&_ipg={}&_udlo={:.2}", base_url, latest_args.per_page, latest_args.minimum_cost);
                log_message(&format!("Constructed URL for 'latest': {}", url), quiet_mode);
                url
            }
            (None, Some(url_arg)) => url_arg.clone(),
            (None, None) => {
                let err_msg = "No URL provided and no command specified. Use --help for usage.";
                log_error(err_msg, true);
                return Err(err_msg.into());
            }
        };

        log_message(&format!("Navigating to {}...", url_to_fetch), quiet_mode);
        let client = reqwest::Client::builder().user_agent(USER_AGENT).build()?;
        let response = client.get(&url_to_fetch).send().await?;
        if !response.status().is_success() {
            let err_msg = format!("Failed to fetch URL: {} - Status: {}", url_to_fetch, response.status());
            log_error(&err_msg, quiet_mode);
            return Err(err_msg.into());
        }
        html_content_to_parse = response.text().await?;
        log_message("Navigation successful. Page content retrieved.", quiet_mode);

        if let Some(save_path_str) = &cli.save {
            log_message(&format!("Saving HTML to {}...", save_path_str), quiet_mode);
            let mut file = File::create(save_path_str)?;
            file.write_all(html_content_to_parse.as_bytes())?;
            log_message("HTML saved.", quiet_mode);

            should_download_images = true;
            let save_file_path = PathBuf::from(save_path_str);
            let base_name = save_file_path.file_stem().unwrap_or_default().to_string_lossy().to_string();
            if let Some(parent_dir) = save_file_path.parent() {
                image_base_save_dir = Some(parent_dir.join(base_name));
            } else {
                image_base_save_dir = Some(PathBuf::from(base_name));
            }
        }
    }

    log_message("Extracting data...", quiet_mode);
    let extracted_results = html_scraper::extract_data_from_html(&html_content_to_parse, quiet_mode)?;
    log_message(&format!("Data extraction complete. Found {} items.", extracted_results.len()), quiet_mode);

    if should_download_images && !extracted_results.is_empty() {
        if let Some(img_base_dir) = image_base_save_dir {
            log_message(&format!("Downloading images into subdirectories of {}...", img_base_dir.display()), quiet_mode);
            let mut download_futures = Vec::new();
            for item in &extracted_results {
                if let Some(img_url) = &item.image_url {
                    let img_base_dir_clone = img_base_dir.clone();
                    let img_url_clone = img_url.clone();
                    let item_id_clone = item.item_id.clone();
                    download_futures.push(async move {
                        if let Err(e) = html_scraper::download_image(&img_url_clone, &img_base_dir_clone, quiet_mode).await {
                            log_error(&format!("Skipping image download for item ID {} (URL: {}) due to error: {}", item_id_clone, img_url_clone, e), quiet_mode);
                        }
                    });
                }
            }
            futures::future::join_all(download_futures).await;
            log_message("Image download process finished.", quiet_mode);
        }
    }

    if quiet_mode {
        println!("{}", serde_json::to_string(&extracted_results)?);
    } else {
        println!("{}", serde_json::to_string_pretty(&extracted_results)?);
    }

    Ok(())
}

// --- Unit tests for parser functions (optional, but good practice) ---
#[cfg(test)]
mod tests {
    use super::parser::*;
    use super::SizeQuantityInfo;

    fn assert_sq_info_eq(actual: SizeQuantityInfo, expected_total_tb: f64, expected_quantity: i32, expected_ind_size_tb: f64, expected_check: bool) {
        assert!(
            (actual.total_tb - expected_total_tb).abs() < 0.0001,
            "TotalTB mismatch. Expected: {}, Got: {}",
            expected_total_tb,
            actual.total_tb
        );
        assert_eq!(actual.quantity, expected_quantity, "Quantity mismatch");
        assert!(
            (actual.individual_size_tb - expected_ind_size_tb).abs() < 0.0001,
            "IndividualSizeTB mismatch. Expected: {}, Got: {}",
            expected_ind_size_tb,
            actual.individual_size_tb
        );
        assert_eq!(actual.needed_description_check, expected_check, "NeededDescriptionCheck mismatch");
    }

    #[test]
    fn test_parse_size_and_quantity() {
        let test_cases = vec![
            ("LOT OF (9) MAJOR BRAND 2.5\" 7MM SSD * Kingston, Samsung, SanDisk& PNY*120-250GB", 1.080, 9, 0.120, true),
            ("Lot of 10 Intel 256 GB 2.5\" SATA SSD different Model check the Description", 2.560, 10, 0.256, true),
            ("Bulk 5 Lot Samsung 870 EVO 500GB SSD SATA - Used - Tested Passed Smart Test", 2.500, 5, 0.500, false),
            ("Samsung 1.6TB NVME PCIe 3.0 x8 2.75\" SSD MZPLK1T6HCHP PM1725 Series TLC", 1.6, 1, 1.6, false),
            ("Micron 5100 MAX 1.84TB SATA 6Gb/s 2.5\" SSD MTFDDAK1T9TCC-1AR1ZABYY", 1.84, 1, 1.84, false),
            ("10-PACK 1TB SSD", 10.0, 10, 1.0, false),
            ("2TB SSD NVMe", 2.0, 1, 2.0, false),
            ("WD Blue 500GB Internal SSD SATA III 6Gb/s", 0.5, 1, 0.5, false),
            ("Lot of 2 Mixed Capacity SSDs (120GB, 240GB) CHECK DESCRIPTION", 0.24, 2, 0.12, true),
            ("Single Drive 1TB", 1.0, 1, 1.0, false),
            ("Lot of 3 - CHECK DESCRIPTION - Mixed SSDs", 0.0, 3, 0.0, true),
        ];
        for (title, total_tb, quantity, ind_size_tb, check) in test_cases {
            println!("Testing title: {}", title);
            let result = parse_size_and_quantity(title);
            assert_sq_info_eq(result, total_tb, quantity, ind_size_tb, check);
        }
    }

    #[test]
    fn test_parse_price() {
        assert_eq!(parse_price("$19.99"), Some(19.99));
        assert_eq!(parse_price("USD 150.00"), Some(150.00));
        assert_eq!(parse_price("$1,234.56"), Some(1234.56));
        assert_eq!(parse_price("Free"), None);
        assert_eq!(parse_price("$10.00 to $20.00"), Some(10.00));
        assert_eq!(parse_price("EUR 25.50"), Some(25.50));
        assert_eq!(parse_price("25.50"), Some(25.50));
    }
}
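
// --- Assumed build manifest (not part of this file) ---
// A minimal Cargo.toml sketch for the crates this program pulls in. The version
// numbers below are assumptions rather than values taken from the project; what
// matters are the feature flags: clap needs "derive" for the Parser/Subcommand/Args
// derives, chrono needs "serde" so DateTime<Utc> can be serialized, and tokio needs
// "macros" plus a runtime for #[tokio::main].
//
// [dependencies]
// clap = { version = "3", features = ["derive"] }
// regex = "1"
// scraper = "0.13"
// serde = { version = "1", features = ["derive"] }
// serde_json = "1"
// chrono = { version = "0.4", features = ["serde"] }
// lazy_static = "1"
// url = "2"
// reqwest = "0.11"
// tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
// futures = "0.3"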