use crate::db::{ItemAppearances, Listing};
use chrono::Utc;
use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector};
use tracing::{debug, info, warn};

lazy_static! {
    // Optional leading `$`, then digits/commas with an optional decimal part (captured).
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
    // Captures the run of digits following `/itm/` in a URL.
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
    // Case-insensitive leading "NEW LISTING" marker plus any `:`, `-` or whitespace after it.
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}

/// Pulls a numeric price out of a display string.
///
/// When the text is a range (`"<low> to <high>"`, matched case-insensitively)
/// only the part before the first `" to "` is considered, so the lower bound
/// is returned. Thousands separators (`,`) are removed before the number is
/// parsed.
///
/// Returns `None` when no price-like token is found or when the token does
/// not parse as a number.
fn parse_price(price_text: &str) -> Option<f64> {
    let lowered = price_text.to_lowercase();

    if lowered.contains(" to ") {
        // Range: look for a price only in the text preceding the first " to ".
        let low_end = lowered
            .split(" to ")
            .next()
            .and_then(|head| PRICE_REGEX.captures(head))
            .and_then(|caps| caps.get(1));

        return match low_end {
            Some(price_match) => {
                info!("Price string:{:?} parsed!", price_match);
                price_match.as_str().replace(',', "").parse().ok()
            }
            None => {
                info!(
                    "Price string:{:?} failed parsing with to, returning none.",
                    price_text
                );
                None
            }
        };
    }

    // Shared failure path for the single-price case.
    let give_up = || -> Option<f64> {
        info!(
            "Price string:{:?} failed parsing, returning none.",
            price_text
        );
        None
    };

    let Some(caps) = PRICE_REGEX.captures(price_text) else {
        return give_up();
    };
    let Some(price_match) = caps.get(1) else {
        return give_up();
    };

    let p: Option<f64> = price_match.as_str().replace(',', "").parse().ok();
    debug!(
        "price regex passed, working on caps:{:?}, price_match:{:?}, p:{:?}",
        caps, price_match, p
    );
    p
}

/// Extracts item data from HTML content.
pub fn extract_data_from_html( html_content: &str, timestamp: &chrono::DateTime, category: &str, ) -> Option> { let document = Html::parse_document(html_content); let mut items = Vec::new(); let item_selector = Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap(); let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap(); let price_selector = Selector::parse(".s-item__price").unwrap(); let image_selector = Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img") .unwrap(); let link_selector = Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap(); let bid_count_selector = Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap(); let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap(); let _secondary_info_selector = Selector::parse(".s-item__detail--secondary").unwrap(); let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap(); for element in document.select(&item_selector) { let raw_title_text = element .select(&title_selector) .next() .map(|el| el.text().collect::().trim().to_string()); let price_text = element .select(&price_selector) .next() .map(|el| el.text().collect::().trim().to_string()); let id = element .select(&link_selector) .next() .and_then(|link_el| link_el.value().attr("href")) .and_then(|href| ITEM_ID_REGEX.captures(href)) .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string())) .and_then(|id_str| id_str.parse::().ok()); if raw_title_text.is_none() || price_text.is_none() || id.is_none() { warn!( "Skipping {:?} due to missing title, price, or item ID.", element ); continue; } if id.unwrap() == 123456 { info!("Skipping {:?} due to bogus ID of 123456", element); continue; } let raw_title = raw_title_text.unwrap(); let price_text = price_text.unwrap(); let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string(); let 
primary_display_price = parse_price(&price_text); let mut current_bid_price: Option = None; let mut final_buy_it_now_price: Option = None; let mut item_is_auction = false; if let Some(bid_el) = element.select(&bid_count_selector).next() { if bid_el .text() .collect::() .to_lowercase() .contains("bid") { item_is_auction = true; } } let has_best_offer = element .select(&primary_info_selector) .any(|e| e.text().any(|e| e.to_lowercase().contains("or best offer"))); if item_is_auction { current_bid_price = primary_display_price; if let Some(bin_el) = element.select(&auction_bin_price_selector).next() { final_buy_it_now_price = parse_price(&bin_el.text().collect::()); } } else { final_buy_it_now_price = primary_display_price; } let image_url = element .select(&image_selector) .next() .and_then(|img_el| { img_el .value() .attr("data-src") .or(img_el.value().attr("src")) }) .map(|s| s.to_string()) .unwrap(); items.push(( Listing { title, id: 0, item_id: id?, buy_it_now_price: final_buy_it_now_price, has_best_offer, image_url, }, ItemAppearances { item: id?, timestamp: *timestamp, current_bid_usd_cents: current_bid_price.map(|b| (b * 100.0).round() as i64), category: category.to_owned(), }, )); } Some(items) } #[cfg(test)] mod tests { use super::*; use similar_asserts::assert_eq; #[test_log::test] fn parse() { let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap(); let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html"); let parsed = extract_data_from_html(html, ×tamp, "ssd").unwrap(); // assert_eq!(parsed.len(), 62); let parsed = parsed.first_chunk::<10>().unwrap(); assert_eq!( parsed[0], ( Listing { id: 0, item_id: 388484391867, title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(), buy_it_now_price: Some(59.99), has_best_offer: true, image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp" .to_string() }, ItemAppearances { item: 388484391867, timestamp: timestamp, category: 
"ssd".to_owned(), current_bid_usd_cents: None } ) ); assert_eq!( parsed[4], (Listing { id: 0, item_id: 286605201240, title: "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s" .to_string(), buy_it_now_price: None, has_best_offer: true, image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp" .to_string() }, ItemAppearances { item: 286605201240, timestamp: timestamp, category: "ssd".to_owned(), current_bid_usd_cents: Some(1260) }) ); } }