Compare commits: main...b94d445cdd (2 commits)

Author | SHA1 | Date
---|---|---
 | b94d445cdd |
 | c1af98d1c2 |

ebay_storage/javascript/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
/node_modules

[image diff omitted: binary file, 442 KiB before and after]

ebay_storage/rust/.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
target

ebay_storage/rust/Cargo.lock (generated, new file, 2416 lines)
File diff suppressed because it is too large.

ebay_storage/rust/Cargo.toml (new file, 21 lines)
@@ -0,0 +1,21 @@
[package]
name = "ebay_scraper_rust"
version = "0.1.0"
edition = "2024"

[dependencies]
clap = { version = "4.5.39", features = ["derive"] }
reqwest = { version = "0.12.18", features = ["json", "stream"] } # Removed "blocking" as we use tokio
scraper = "0.23.1"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
regex = "1.11.1"
tokio = { version = "1.45.1", features = ["full"] }
url = "2.5.4"
# path-slash is not strictly needed if using std::path::PathBuf correctly
bytes = "1.10.1"
chrono = { version = "0.4.41", features = ["serde"] }
lazy_static = "1.5.0"
futures = "0.3.31" # For join_all on async tasks
tracing = "0.1.41"
tracing-subscriber = { version = "0.3.19", features = ["fmt"] }

ebay_storage/rust/src/cli.rs (new file, 45 lines)
@@ -0,0 +1,45 @@
// src/cli.rs
use clap::Parser;

#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for SSD/HDD cost per TB."
)]
pub struct Cli {
    #[clap(subcommand)]
    pub command: Option<Commands>,

    /// The full eBay search URL to scrape.
    pub url: Option<String>,

    /// Save scraped HTML to a file (and download images if fetching from URL).
    #[clap(long)]
    pub save: Option<String>,

    /// Load HTML from a file (disables network). Image download will not occur with --load.
    #[clap(long)]
    pub load: Option<String>,

    /// Suppress informational logs, output only final JSON.
    #[clap(long)]
    pub only_json: bool,
}

#[derive(Parser, Debug)]
pub enum Commands {
    /// Scrapes latest listings.
    Latest(LatestArgs),
}

#[derive(Parser, Debug)]
pub struct LatestArgs {
    /// Items per page (60, 120, or 240)
    #[clap(long, default_value = "60")]
    pub per_page: String,

    /// Minimum cost (e.g., 50.00)
    #[clap(long, default_value = "0.00")]
    pub minimum_cost: f64,
}
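
Not in the diff: a minimal test sketch that could sit at the bottom of src/cli.rs to pin down the derived interface. It assumes clap 4's default kebab-case renaming (so the field `per_page` is exposed as `--per-page`) and that clap resolves a non-subcommand first argument into the `url` positional, which is how main.rs consumes it; the test names and argument values are invented.

```rust
#[cfg(test)]
mod cli_tests {
    use super::{Cli, Commands};
    use clap::Parser;

    #[test]
    fn parses_latest_subcommand_flags() {
        // `latest` routes into Commands::Latest with its own flags.
        let cli = Cli::parse_from([
            "ebay-scraper-rust",
            "latest",
            "--per-page",
            "120",
            "--minimum-cost",
            "25.00",
        ]);
        match cli.command {
            Some(Commands::Latest(args)) => {
                assert_eq!(args.per_page, "120");
                assert!((args.minimum_cost - 25.0).abs() < f64::EPSILON);
            }
            other => panic!("expected the `latest` subcommand, got {:?}", other),
        }
    }

    #[test]
    fn parses_bare_url_positional() {
        // A first argument that is not a subcommand name fills the `url` positional.
        let cli = Cli::parse_from(["ebay-scraper-rust", "https://www.ebay.com/sch/i.html"]);
        assert!(cli.command.is_none());
        assert_eq!(cli.url.as_deref(), Some("https://www.ebay.com/sch/i.html"));
    }
}
```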

ebay_storage/rust/src/html_utils.rs (new file, 250 lines)
@@ -0,0 +1,250 @@
// src/html_utils.rs
use chrono::Utc;
use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector};
use std::error::Error as StdError; // Use the same alias as main
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;
use tracing::{error, info, warn};
use url::Url;

use super::item::{EbayItem, ParsedItemData};
use super::parser_utils;

// Define or import AppError to match main.rs
type AppError = Box<dyn StdError + Send + Sync + 'static>;

const PARSER_ENGINE_VERSION: i32 = 1;
const USER_AGENT: &str = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36";

lazy_static! {
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}

/// Fetches HTML content from a URL.
pub async fn fetch_html(url: &str) -> Result<String, AppError> {
    info!(target_url = url, "Navigating to URL");
    let client = reqwest::Client::builder().user_agent(USER_AGENT).build()?;
    let response = client.get(url).send().await?;
    if !response.status().is_success() {
        let err_msg = format!(
            "Failed to fetch URL: {} - Status: {}",
            url,
            response.status()
        );
        error!(error_message = %err_msg, "URL fetch failed");
        return Err(err_msg.into());
    }
    let html_content = response.text().await?;
    info!(
        target_url = url,
        "Navigation successful. Page content retrieved."
    );
    Ok(html_content)
}

/// Extracts item data from HTML content.
pub fn extract_data_from_html(html_content: &str) -> Result<Vec<EbayItem>, AppError> {
    let document = Html::parse_document(html_content);
    let mut items = Vec::new();
    let today = Utc::now();

    // MODIFIED: Using .unwrap() for Selector::parse() calls as requested
    let item_selector =
        Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
    let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
    let price_selector = Selector::parse(".s-item__price").unwrap();
    let image_selector =
        Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
            .unwrap();
    let link_selector =
        Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
    let bid_count_selector = Selector::parse(".s-item__bid-count").unwrap();
    let best_offer_selector =
        Selector::parse(".s-item__purchase-options--bo, .s-item__best-offer").unwrap();
    let secondary_info_selector =
        Selector::parse(".s-item__subtitle, .s-item__secondary-text, .s-item__detail--secondary")
            .unwrap();
    let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();

    for element in document.select(&item_selector) {
        let raw_title_text = element
            .select(&title_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let price_text = element
            .select(&price_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());

        let item_id = element
            .select(&link_selector)
            .next()
            .and_then(|link_el| link_el.value().attr("href"))
            .and_then(|href| ITEM_ID_REGEX.captures(href))
            .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()));

        if raw_title_text.is_none() || price_text.is_none() || item_id.is_none() {
            warn!("Skipping item due to missing title, price, or item ID.");
            continue;
        }
        let raw_title = raw_title_text.unwrap();
        let price_text = price_text.unwrap();
        let item_id = item_id.unwrap();

        let cleaned_title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();

        let primary_display_price = parser_utils::parse_price(&price_text);

        let mut current_bid_price: Option<f64> = None;
        let mut final_buy_it_now_price: Option<f64> = None;
        let mut has_best_offer = false;
        let mut item_is_auction = false;

        if let Some(bid_el) = element.select(&bid_count_selector).next() {
            if bid_el
                .text()
                .collect::<String>()
                .to_lowercase()
                .contains("bid")
            {
                item_is_auction = true;
            }
        }

        if element.select(&best_offer_selector).next().is_some() {
            has_best_offer = true;
        } else {
            for el in element.select(&secondary_info_selector) {
                if el
                    .text()
                    .collect::<String>()
                    .to_lowercase()
                    .contains("or best offer")
                {
                    has_best_offer = true;
                    break;
                }
            }
        }

        if item_is_auction {
            current_bid_price = primary_display_price;
            if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
                final_buy_it_now_price =
                    parser_utils::parse_price(&bin_el.text().collect::<String>());
            }
        } else {
            final_buy_it_now_price = primary_display_price;
        }

        let image_url_val = element
            .select(&image_selector)
            .next()
            .and_then(|img_el| {
                img_el
                    .value()
                    .attr("data-src")
                    .or(img_el.value().attr("src"))
            })
            .map(|s| s.to_string());

        let parsed_size_info = parser_utils::parse_size_and_quantity(&cleaned_title);

        let cost_per_tb = if let Some(price) = primary_display_price {
            if parsed_size_info.total_tb > 0.0 {
                Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
            } else {
                None
            }
        } else {
            None
        };

        let parsed_data = ParsedItemData {
            item_count: parsed_size_info.quantity,
            size_per_item_tb: if parsed_size_info.individual_size_tb > 0.0 {
                Some(parsed_size_info.individual_size_tb)
            } else {
                None
            },
            total_tb: if parsed_size_info.total_tb > 0.0 {
                Some(parsed_size_info.total_tb)
            } else {
                None
            },
            cost_per_tb,
            needed_description_check: parsed_size_info.needed_description_check,
            parser_engine: PARSER_ENGINE_VERSION,
        };

        items.push(EbayItem {
            title: cleaned_title,
            item_id,
            date_found: today,
            current_bid_price,
            buy_it_now_price: final_buy_it_now_price,
            has_best_offer,
            image_url: image_url_val,
            parsed: parsed_data,
        });
    }
    Ok(items)
}

/// Downloads an image from a URL and saves it, preserving path structure.
pub async fn download_image(
    image_url_str: &str,
    base_save_directory: &Path,
) -> Result<(), AppError> {
    if image_url_str.is_empty() {
        return Ok(());
    }

    let parsed_url = Url::parse(image_url_str).map_err(|e| Box::new(e) as AppError)?;

    let image_path_from_url = parsed_url.path().trim_start_matches('/');
    if image_path_from_url.is_empty() {
        return Err(Box::from("Image URL has no path component") as AppError);
    }

    let full_local_image_path = base_save_directory.join(image_path_from_url);

    if let Some(parent_dir) = full_local_image_path.parent() {
        fs::create_dir_all(parent_dir).map_err(|e| Box::new(e) as AppError)?;
        info!(path = %parent_dir.display(), "Ensured image directory exists");
    }

    let client = reqwest::Client::builder()
        .user_agent(USER_AGENT)
        .build()
        .map_err(|e| Box::new(e) as AppError)?;
    let response = client
        .get(image_url_str)
        .send()
        .await
        .map_err(|e| Box::new(e) as AppError)?;

    if !response.status().is_success() {
        let err_msg = format!(
            "Failed to download image {}. Status: {}",
            image_url_str,
            response.status()
        );
        return Err(Box::from(err_msg) as AppError);
    }

    let mut file = File::create(&full_local_image_path).map_err(|e| Box::new(e) as AppError)?;
    let content = response
        .bytes()
        .await
        .map_err(|e| Box::new(e) as AppError)?;
    file.write_all(&content)
        .map_err(|e| Box::new(e) as AppError)?;

    info!(path = %full_local_image_path.display(), "Downloaded image");
    Ok(())
}
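
Not in the diff: a sketch of the minimum markup `extract_data_from_html` will accept — a container matching the item selector plus a title, a price, and an `/itm/` link; an element missing any of those is skipped with a warning. The HTML fragment below is invented for illustration and far cleaner than real eBay result pages.

```rust
#[cfg(test)]
mod html_extract_tests {
    use super::extract_data_from_html;

    #[test]
    fn extracts_one_item_from_minimal_markup() {
        let html = r#"
            <ul>
              <li class="s-item">
                <a class="s-item__link" href="https://www.ebay.com/itm/123456789012">
                  <div class="s-item__title">Lot of 2 Samsung 870 EVO 1TB SSD</div>
                </a>
                <span class="s-item__price">$99.99</span>
              </li>
            </ul>"#;
        let items = extract_data_from_html(html).expect("extraction should succeed");
        assert_eq!(items.len(), 1);
        assert_eq!(items[0].item_id, "123456789012");
        // No bid count in the markup, so the price is treated as Buy It Now.
        assert_eq!(items[0].buy_it_now_price, Some(99.99));
        // "Lot of 2 ... 1TB" parses to quantity 2 at 1 TB each.
        assert_eq!(items[0].parsed.item_count, 2);
        assert_eq!(items[0].parsed.total_tb, Some(2.0));
    }
}
```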

ebay_storage/rust/src/item.rs (new file, 45 lines)
@@ -0,0 +1,45 @@
// src/item.rs
use chrono::{DateTime, Utc};
use serde::Serialize;

#[derive(Serialize, Debug)]
pub struct EbayItem {
    pub title: String,
    #[serde(rename = "itemId")]
    pub item_id: String,
    #[serde(rename = "dateFound")]
    pub date_found: DateTime<Utc>,
    #[serde(rename = "currentBidPrice")]
    pub current_bid_price: Option<f64>,
    #[serde(rename = "buyItNowPrice", skip_serializing_if = "Option::is_none")]
    pub buy_it_now_price: Option<f64>,
    #[serde(rename = "hasBestOffer")]
    pub has_best_offer: bool,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_url: Option<String>,
    pub parsed: ParsedItemData,
}

#[derive(Serialize, Debug)]
pub struct ParsedItemData {
    #[serde(rename = "itemCount")]
    pub item_count: i32,
    #[serde(rename = "sizePerItemTB")]
    pub size_per_item_tb: Option<f64>,
    #[serde(rename = "totalTB")]
    pub total_tb: Option<f64>,
    #[serde(rename = "costPerTB")]
    pub cost_per_tb: Option<f64>,
    #[serde(rename = "needed_description_check")]
    pub needed_description_check: bool,
    #[serde(rename = "parser_engine")]
    pub parser_engine: i32,
}

#[derive(Debug)]
pub struct SizeQuantityInfo {
    pub total_tb: f64,
    pub quantity: i32,
    pub individual_size_tb: f64,
    pub needed_description_check: bool,
}
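
Not in the diff: a sketch of the JSON these serde attributes produce — camelCase keys from the renames, `currentBidPrice` emitted as null when absent, and `buyItNowPrice`/`image_url` dropped entirely when None via `skip_serializing_if`. The field values are invented.

```rust
#[cfg(test)]
mod item_serde_tests {
    use super::{EbayItem, ParsedItemData};
    use chrono::Utc;

    #[test]
    fn serializes_with_camel_case_keys() {
        let item = EbayItem {
            title: "2TB SSD NVMe".to_string(),
            item_id: "123456789012".to_string(),
            date_found: Utc::now(),
            current_bid_price: None, // serialized as null (no skip attribute)
            buy_it_now_price: Some(59.99),
            has_best_offer: false,
            image_url: None, // omitted from the JSON entirely
            parsed: ParsedItemData {
                item_count: 1,
                size_per_item_tb: Some(2.0),
                total_tb: Some(2.0),
                cost_per_tb: Some(30.0),
                needed_description_check: false,
                parser_engine: 1,
            },
        };
        let json = serde_json::to_string(&item).unwrap();
        assert!(json.contains(r#""itemId":"123456789012""#));
        assert!(json.contains(r#""buyItNowPrice":59.99"#));
        assert!(json.contains(r#""currentBidPrice":null"#));
        assert!(!json.contains("image_url"));
    }
}
```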

ebay_storage/rust/src/main.rs (new file, 176 lines)
@@ -0,0 +1,176 @@
// src/main.rs
mod cli;
mod item;
mod parser_utils;
mod html_utils;

use clap::Parser;
use std::fs;
use std::io::Write;
use std::path::PathBuf;
use std::error::Error as StdError;
use tracing::{info, error, warn, Level};
use tracing_subscriber;

use cli::{Cli, Commands};
use item::EbayItem;

// Define a more specific error type for the application.
// This type is now implicitly used by html_utils.rs as well due to function signatures.
type AppError = Box<dyn StdError + Send + Sync + 'static>;

#[tokio::main]
async fn main() -> Result<(), AppError> {
    let cli_args = Cli::parse();

    let subscriber_builder = tracing_subscriber::fmt().with_writer(std::io::stderr);
    if cli_args.only_json {
        subscriber_builder
            .with_max_level(Level::ERROR)
            .try_init()?;
    } else {
        subscriber_builder
            .with_max_level(Level::INFO)
            .try_init()?;
    }

    info!("Starting scraping process...");

    let html_content_to_parse: String;
    let mut should_download_images = false;
    let mut image_base_save_dir: Option<PathBuf> = None;

    if let Some(html_file) = &cli_args.load {
        info!(file_path = %html_file, "Loading HTML from file");
        html_content_to_parse = fs::read_to_string(html_file)?; // std::io::Error converts to AppError via ?
        info!("HTML loaded. Network requests for page content disabled.");
    } else {
        let url_to_fetch = match (&cli_args.command, &cli_args.url) {
            (Some(Commands::Latest(latest_args)), _) => {
                let valid_per_page = ["60", "120", "240"];
                if !valid_per_page.contains(&latest_args.per_page.as_str()) {
                    let err_msg = format!("--per-page must be one of {}, got {}", valid_per_page.join(", "), latest_args.per_page);
                    error!(error_message = %err_msg, "Invalid per_page argument");
                    return Err(err_msg.into()); // String converts to AppError
                }
                if latest_args.minimum_cost < 0.0 {
                    let err_msg = "--minimum-cost must be a non-negative number.";
                    error!(error_message = %err_msg, "Invalid minimum_cost argument");
                    return Err(err_msg.into()); // &str converts to AppError
                }
                let base_url = "https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10";
                let url = format!("{}&_ipg={}&_udlo={:.2}", base_url, latest_args.per_page, latest_args.minimum_cost);
                info!(constructed_url = %url, "Constructed URL for 'latest' command");
                url
            }
            (None, Some(url_arg)) => {
                url_arg.clone()
            }
            (None, None) => {
                let err_msg = "No URL provided and no command specified. Use --help for usage.";
                eprintln!("ERROR: {}", err_msg);
                return Err(err_msg.into()); // &str converts to AppError
            }
        };

        html_content_to_parse = html_utils::fetch_html(&url_to_fetch).await?; // Returns Result<String, AppError>

        if let Some(save_path_str) = &cli_args.save {
            info!(file_path = %save_path_str, "Saving HTML to file");
            let mut file = fs::File::create(save_path_str)?; // std::io::Error converts
            file.write_all(html_content_to_parse.as_bytes())?; // std::io::Error converts
            info!("HTML saved.");

            should_download_images = true;
            let save_file_path = PathBuf::from(save_path_str);
            let base_name = save_file_path.file_stem().unwrap_or_default().to_string_lossy().to_string();
            if let Some(parent_dir) = save_file_path.parent() {
                image_base_save_dir = Some(parent_dir.join(base_name));
            } else {
                image_base_save_dir = Some(PathBuf::from(base_name));
            }
        }
    }

    info!("Extracting data...");
    let extracted_results: Vec<EbayItem> = html_utils::extract_data_from_html(&html_content_to_parse)?; // Returns Result<Vec<EbayItem>, AppError>
    info!(item_count = extracted_results.len(), "Data extraction complete.");

    if should_download_images && !extracted_results.is_empty() {
        if let Some(img_base_dir) = image_base_save_dir {
            info!(directory = %img_base_dir.display(), "Downloading images");

            let mut download_futures = Vec::new();
            for item in &extracted_results {
                if let Some(img_url) = &item.image_url {
                    let img_base_dir_clone = img_base_dir.clone();
                    let img_url_clone = img_url.clone();
                    let item_id_clone = item.item_id.clone();

                    download_futures.push(async move {
                        if let Err(e) = html_utils::download_image(&img_url_clone, &img_base_dir_clone).await {
                            warn!(item_id = %item_id_clone, image_url = %img_url_clone, error = %e, "Skipping image download due to error");
                        }
                    });
                }
            }
            futures::future::join_all(download_futures).await;
            info!("Image download process finished.");
        }
    }

    if cli_args.only_json {
        println!("{}", serde_json::to_string(&extracted_results)?); // serde_json::Error converts
    } else {
        println!("{}", serde_json::to_string_pretty(&extracted_results)?); // serde_json::Error converts
    }

    Ok(())
}

#[cfg(test)]
mod tests {
    use super::parser_utils::*;
    use super::item::SizeQuantityInfo;

    fn assert_sq_info_eq(actual: SizeQuantityInfo, expected_total_tb: f64, expected_quantity: i32, expected_ind_size_tb: f64, expected_check: bool) {
        assert!((actual.total_tb - expected_total_tb).abs() < 0.0001, "TotalTB mismatch. Expected: {}, Got: {}", expected_total_tb, actual.total_tb);
        assert_eq!(actual.quantity, expected_quantity, "Quantity mismatch");
        assert!((actual.individual_size_tb - expected_ind_size_tb).abs() < 0.0001, "IndividualSizeTB mismatch. Expected: {}, Got: {}", expected_ind_size_tb, actual.individual_size_tb);
        assert_eq!(actual.needed_description_check, expected_check, "NeededDescriptionCheck mismatch");
    }

    #[test]
    fn test_parse_size_and_quantity() {
        let test_cases = vec![
            ("LOT OF (9) MAJOR BRAND 2.5\" 7MM SSD * Kingston, Samsung, SanDisk& PNY*120-250GB", 1.080, 9, 0.120, true),
            ("Lot of 10 Intel 256 GB 2.5\" SATA SSD different Model check the Description", 2.560, 10, 0.256, true),
            ("Bulk 5 Lot Samsung 870 EVO 500GB SSD SATA - Used - Tested Passed Smart Test", 2.500, 5, 0.500, false),
            ("Samsung 1.6TB NVME PCIe 3.0 x8 2.75\" SSD MZPLK1T6HCHP PM1725 Series TLC", 1.6, 1, 1.6, false),
            ("Micron 5100 MAX 1.84TB SATA 6Gb/s 2.5\" SSD MTFDDAK1T9TCC-1AR1ZABYY", 1.84, 1, 1.84, false),
            ("10-PACK 1TB SSD", 10.0, 10, 1.0, false),
            ("2TB SSD NVMe", 2.0, 1, 2.0, false),
            ("WD Blue 500GB Internal SSD SATA III 6Gb/s", 0.5, 1, 0.5, false),
            ("Lot of 2 Mixed Capacity SSDs (120GB, 240GB) CHECK DESCRIPTION", 0.24, 2, 0.12, true),
            ("Single Drive 1TB", 1.0, 1, 1.0, false),
            ("Lot of 3 - CHECK DESCRIPTION - Mixed SSDs", 0.0, 3, 0.0, true),
        ];

        for (title, total_tb, quantity, ind_size_tb, check) in test_cases {
            tracing::debug!(testing_title = %title, "Running test_parse_size_and_quantity");
            let result = parse_size_and_quantity(title);
            assert_sq_info_eq(result, total_tb, quantity, ind_size_tb, check);
        }
    }

    #[test]
    fn test_parse_price() {
        assert_eq!(parse_price("$19.99"), Some(19.99));
        assert_eq!(parse_price("USD 150.00"), Some(150.00));
        assert_eq!(parse_price("$1,234.56"), Some(1234.56));
        assert_eq!(parse_price("Free"), None);
        assert_eq!(parse_price("$10.00 to $20.00"), Some(10.00));
        assert_eq!(parse_price("EUR 25.50"), Some(25.50));
        assert_eq!(parse_price("25.50"), Some(25.50));
    }
}
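
Not in the diff: a standalone sketch of the path derivation in the `--save` branch above, pulled into a hypothetical helper (`image_dir_for` is not a name from the code) so the behaviour is easy to see — images are mirrored under a directory named after the saved file's stem.

```rust
use std::path::PathBuf;

// Hypothetical helper isolating main()'s image-directory logic.
fn image_dir_for(save_path: &str) -> PathBuf {
    let save_file_path = PathBuf::from(save_path);
    let base_name = save_file_path
        .file_stem()
        .unwrap_or_default()
        .to_string_lossy()
        .to_string();
    match save_file_path.parent() {
        Some(parent_dir) => parent_dir.join(base_name),
        None => PathBuf::from(base_name),
    }
}

fn main() {
    // --save out/page.html  =>  images mirrored under out/page/...
    assert_eq!(image_dir_for("out/page.html"), PathBuf::from("out/page"));
    // A bare file name keeps images next to it, under ./page/...
    assert_eq!(image_dir_for("page.html"), PathBuf::from("page"));
}
```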

ebay_storage/rust/src/parser_utils.rs (new file, 122 lines)
@@ -0,0 +1,122 @@
// src/parser_utils.rs
use super::item::SizeQuantityInfo; // Assuming item.rs is in the same directory (src)
use lazy_static::lazy_static;
use regex::Regex;

lazy_static! {
    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
    ];
    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
    static ref SIZE_RANGE_REGEX: Regex =
        Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)")
            .unwrap();
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
}

/// Parses size and quantity information from an item title.
pub fn parse_size_and_quantity(title: &str) -> SizeQuantityInfo {
    let upper_title = title.to_uppercase();
    let mut total_tb = 0.0;
    let mut quantity = 1;
    let mut needed_description_check = false;
    let mut individual_size_tb = 0.0;

    for pattern in EXPLICIT_QTY_PATTERNS.iter() {
        if let Some(caps) = pattern.captures(&upper_title) {
            if let Some(qty_match) = caps.get(1) {
                if let Ok(parsed_qty) = qty_match.as_str().parse::<i32>() {
                    if parsed_qty > 0 && parsed_qty < 500 {
                        quantity = parsed_qty;
                        break;
                    }
                }
            }
        }
    }

    let mut size_matches: Vec<(f64, String)> = Vec::new();
    for caps in SIZE_REGEX.captures_iter(&upper_title) {
        if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
            if let Ok(val) = val_str.as_str().parse::<f64>() {
                size_matches.push((val, unit_str.as_str().to_string()));
            }
        }
    }

    if !size_matches.is_empty() {
        let mut unique_sizes_tb: Vec<f64> = size_matches
            .iter()
            .map(|(val, unit)| if unit == "GB" { *val / 1000.0 } else { *val })
            .collect();
        unique_sizes_tb.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        unique_sizes_tb.dedup();

        if !unique_sizes_tb.is_empty() {
            individual_size_tb = unique_sizes_tb[0];
            if unique_sizes_tb.len() > 1 {
                needed_description_check = true;
            }
        }
    }

    if SIZE_RANGE_REGEX.is_match(&upper_title) {
        needed_description_check = true;
    }
    if quantity > 1 && upper_title.contains("MIXED") {
        needed_description_check = true;
    }
    if upper_title.contains("CHECK THE DESCRIPTION")
        || upper_title.contains("CHECK DESCRIPTION")
        || upper_title.contains("SEE DESCRIPTION")
    {
        if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
            needed_description_check = true;
        }
    }

    if individual_size_tb > 0.0 {
        total_tb = individual_size_tb * quantity as f64;
    }

    if quantity > 1 && total_tb == 0.0 && !size_matches.is_empty() {
        needed_description_check = true;
    }

    if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
        // This condition is implicitly handled
    }

    SizeQuantityInfo {
        total_tb: (total_tb * 10000.0).round() / 10000.0,
        quantity,
        individual_size_tb: (individual_size_tb * 10000.0).round() / 10000.0,
        needed_description_check,
    }
}

/// Parses price from a string, taking the first price if it's a range.
pub fn parse_price(price_text: &str) -> Option<f64> {
    let lower_price_text = price_text.to_lowercase();
    if lower_price_text.contains(" to ") {
        if let Some(first_part) = lower_price_text.split(" to ").next() {
            if let Some(caps) = PRICE_REGEX.captures(first_part) {
                if let Some(price_match) = caps.get(1) {
                    return price_match.as_str().replace(',', "").parse().ok();
                }
            }
        }
        return None;
    }

    if let Some(caps) = PRICE_REGEX.captures(price_text) {
        if let Some(price_match) = caps.get(1) {
            return price_match.as_str().replace(',', "").parse().ok();
        }
    }
    None
}
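
Not in the diff: a sketch combining the two parsers the way html_utils.rs derives `costPerTB` — listing price divided by total terabytes, rounded to cents. The title and price are invented.

```rust
#[cfg(test)]
mod cost_per_tb_tests {
    use super::{parse_price, parse_size_and_quantity};

    #[test]
    fn derives_cost_per_tb_from_title_and_price() {
        // "Lot of 10" x 256 GB => 10 * 0.256 TB = 2.56 TB total.
        let info = parse_size_and_quantity("Lot of 10 Intel 256 GB 2.5\" SATA SSD");
        assert!((info.total_tb - 2.56).abs() < 1e-9);

        // $200.00 / 2.56 TB = 78.125, rounded to cents as in html_utils.rs.
        let price = parse_price("$200.00").expect("price should parse");
        let cost_per_tb = ((price / info.total_tb) * 100.0).round() / 100.0;
        assert!((cost_per_tb - 78.13).abs() < 1e-9);
    }
}
```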