Files
ebay_scraper_rust/src/parser_ebay.rs
hak8or 817b1d6275
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 3m30s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 3m57s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m9s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 9m50s
Add ItemAppearances to track price and page history
2025-06-28 01:00:28 -04:00

226 lines
7.9 KiB
Rust

use crate::db::{ItemAppearances, Listing};
use chrono::Utc;
use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector};
use tracing::{debug, info, warn};
// Regular expressions shared by the parsing helpers in this file. `lazy_static!` compiles each
// pattern once, on first use; the `unwrap()` calls can only fail if a literal pattern below is
// itself invalid.
lazy_static! {
// Optional leading `$`, then capture group 1: a run of digits and/or commas, optionally followed
// by a decimal point and more digits (e.g. the `1,234.56` in `$1,234.56`).
static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
// Capture group 1: the run of digits that directly follows `/itm/` in a URL.
static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
// Case-insensitive match of a leading "NEW LISTING" marker (with optional surrounding whitespace)
// plus any colons, hyphens or whitespace that trail it.
static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}
/// Parses a price out of free-form listing text.
///
/// * If the text is a range (contains `" to "`, matched case-insensitively, e.g.
///   `"$10.00 to $20.00"`), only the part before the first `" to "` is examined, so the lower
///   bound of the range is returned.
/// * Thousands separators (`,`) are stripped before the number is parsed.
/// * Every candidate matched by `PRICE_REGEX` is tried in order and the first one that parses
///   as an `f64` wins. The pattern also matches a lone `,`, so stray punctuation ahead of the
///   amount would otherwise make the whole parse fail even though a valid price follows.
///
/// Returns `None` when no candidate in the examined text parses as a number.
fn parse_price(price_text: &str) -> Option<f64> {
    let lower_price_text = price_text.to_lowercase();

    // For a range, restrict the search to the text before " to "; otherwise scan the whole input.
    let (candidate_text, is_range) = match lower_price_text.split_once(" to ") {
        Some((first_part, _)) => (first_part, true),
        None => (price_text, false),
    };

    let parsed = PRICE_REGEX
        .captures_iter(candidate_text)
        .filter_map(|caps| caps.get(1))
        .find_map(|price_match| price_match.as_str().replace(',', "").parse::<f64>().ok());

    match parsed {
        Some(price) => debug!("Price string:{:?} parsed as {:?}", price_text, price),
        None if is_range => info!(
            "Price string:{:?} failed parsing with to, returning none.",
            price_text
        ),
        None => info!(
            "Price string:{:?} failed parsing, returning none.",
            price_text
        ),
    }
    parsed
}
/// Extracts listing data from the HTML of a search-results page.
///
/// # Parameters
/// * `html_content` - raw HTML of the page.
/// * `timestamp` - copied into every returned [`ItemAppearances`].
/// * `category` - copied into every returned [`ItemAppearances`].
///
/// # Returns
/// One `(Listing, ItemAppearances)` pair per result element that could be fully parsed, in
/// document order. The function currently always returns `Some`; the `Option` is kept so the
/// signature stays compatible with existing callers.
///
/// # Skipped elements
/// An element is logged and skipped (never a panic) when it lacks a title, a price element, a
/// numeric item ID in its link, or an image URL, or when its item ID is the bogus `123456`.
///
/// # Price handling
/// * Auction listings (the bid-count element mentions "bid"): the displayed price becomes
///   `current_bid_usd_cents`, and `buy_it_now_price` is taken from the separate buy-it-now
///   element if one is present.
/// * All other listings: the displayed price becomes `buy_it_now_price` and there is no bid.
pub fn extract_data_from_html(
    html_content: &str,
    timestamp: &chrono::DateTime<Utc>,
    category: &str,
) -> Option<Vec<(Listing, ItemAppearances)>> {
    let document = Html::parse_document(html_content);
    let mut items = Vec::new();

    // All selectors are literals, so a failed parse here would be a programming error.
    let item_selector =
        Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
    let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
    let price_selector = Selector::parse(".s-item__price").unwrap();
    let image_selector =
        Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
            .unwrap();
    let link_selector =
        Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
    let bid_count_selector =
        Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap();
    let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap();
    let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();

    for element in document.select(&item_selector) {
        let raw_title_text = element
            .select(&title_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let price_text = element
            .select(&price_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let id = element
            .select(&link_selector)
            .next()
            .and_then(|link_el| link_el.value().attr("href"))
            .and_then(|href| ITEM_ID_REGEX.captures(href))
            .and_then(|caps| caps.get(1))
            .and_then(|id_match| id_match.as_str().parse::<i64>().ok());

        // Title, price and item ID are all mandatory; destructure them together so no later
        // `unwrap()`/`?` is needed (a `?` here would discard the whole page, not just this item).
        let (Some(raw_title), Some(price_text), Some(item_id)) = (raw_title_text, price_text, id)
        else {
            warn!(
                "Skipping {:?} due to missing title, price, or item ID.",
                element
            );
            continue;
        };

        // NOTE(review): 123456 is presumably the ID of a placeholder/template entry that the
        // results page embeds — confirm against real page captures.
        if item_id == 123456 {
            info!("Skipping {:?} due to bogus ID of 123456", element);
            continue;
        }

        // Prefer the `data-src` attribute and fall back to plain `src`.
        let image_url = element
            .select(&image_selector)
            .next()
            .and_then(|img_el| {
                let img = img_el.value();
                img.attr("data-src").or_else(|| img.attr("src"))
            })
            .map(|url| url.to_string());
        // `Listing::image_url` is not optional, so treat it like the other required fields:
        // skip the element instead of panicking on scraped input.
        let Some(image_url) = image_url else {
            warn!("Skipping {:?} due to missing image URL.", element);
            continue;
        };

        let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
        let primary_display_price = parse_price(&price_text);

        let item_is_auction = element
            .select(&bid_count_selector)
            .next()
            .is_some_and(|bid_el| {
                bid_el
                    .text()
                    .collect::<String>()
                    .to_lowercase()
                    .contains("bid")
            });

        let has_best_offer = element
            .select(&primary_info_selector)
            .any(|info_el| {
                info_el
                    .text()
                    .any(|text| text.to_lowercase().contains("or best offer"))
            });

        // For auctions the displayed price is the current bid and buy-it-now (if any) lives in
        // its own element; for everything else the displayed price *is* the buy-it-now price.
        let (current_bid_price, final_buy_it_now_price) = if item_is_auction {
            let bin_price = element
                .select(&auction_bin_price_selector)
                .next()
                .and_then(|bin_el| parse_price(&bin_el.text().collect::<String>()));
            (primary_display_price, bin_price)
        } else {
            (None, primary_display_price)
        };

        items.push((
            Listing {
                title,
                id: 0,
                item_id,
                buy_it_now_price: final_buy_it_now_price,
                has_best_offer,
                image_url,
            },
            ItemAppearances {
                item: item_id,
                timestamp: *timestamp,
                // Stored as whole cents; round to avoid float artefacts such as 12.599999.
                current_bid_usd_cents: current_bid_price.map(|b| (b * 100.0).round() as i64),
                category: category.to_owned(),
            },
        ));
    }

    Some(items)
}
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    /// Runs the extractor over a captured "ssd" results page and spot-checks two of the first
    /// ten results: one with only a buy-it-now price and one with only a current bid.
    #[test_log::test]
    fn parse() {
        let captured_at = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
        let page = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
        let listings = extract_data_from_html(page, &captured_at, "ssd").unwrap();
        // assert_eq!(listings.len(), 62);
        let first_ten = listings.first_chunk::<10>().unwrap();

        // Buy-it-now listing: price present, no current bid.
        let expected_buy_it_now = (
            Listing {
                id: 0,
                item_id: 388484391867,
                title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
                buy_it_now_price: Some(59.99),
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
                    .to_string(),
            },
            ItemAppearances {
                item: 388484391867,
                timestamp: captured_at,
                category: "ssd".to_owned(),
                current_bid_usd_cents: None,
            },
        );
        assert_eq!(first_ten[0], expected_buy_it_now);

        // Auction listing: current bid present, no buy-it-now price.
        let expected_auction = (
            Listing {
                id: 0,
                item_id: 286605201240,
                title:
                    "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
                        .to_string(),
                buy_it_now_price: None,
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
                    .to_string(),
            },
            ItemAppearances {
                item: 286605201240,
                timestamp: captured_at,
                category: "ssd".to_owned(),
                current_bid_usd_cents: Some(1260),
            },
        );
        assert_eq!(first_ten[4], expected_auction);
    }
}