All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 3m30s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 3m57s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m9s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 9m50s
226 lines
7.9 KiB
Rust
226 lines
7.9 KiB
Rust
use crate::db::{ItemAppearances, Listing};
|
|
use chrono::Utc;
|
|
use lazy_static::lazy_static;
|
|
use regex::Regex;
|
|
use scraper::{Html, Selector};
|
|
use tracing::{debug, info, warn};
|
|
|
|
lazy_static! {
    /// Captures a run of digits and commas with an optional decimal part, allowing an
    /// optional leading dollar sign. Group 1 holds the numeric text without the `$`.
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();

    /// Captures (group 1) the run of digits that directly follows `/itm/`.
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();

    /// Matches a case-insensitive "NEW LISTING" marker at the start of a string, together
    /// with any colons, dashes, or whitespace that follow it.
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}
|
|
|
|
/// Parses price from a string, taking the first price if it's a range.
|
|
fn parse_price(price_text: &str) -> Option<f64> {
|
|
let lower_price_text = price_text.to_lowercase();
|
|
if lower_price_text.contains(" to ") {
|
|
if let Some(first_part) = lower_price_text.split(" to ").next() {
|
|
if let Some(caps) = PRICE_REGEX.captures(first_part) {
|
|
if let Some(price_match) = caps.get(1) {
|
|
info!("Price string:{:?} parsed!", price_match);
|
|
return price_match.as_str().replace(',', "").parse().ok();
|
|
}
|
|
}
|
|
}
|
|
info!(
|
|
"Price string:{:?} failed parsing with to, returning none.",
|
|
price_text
|
|
);
|
|
return None;
|
|
}
|
|
|
|
if let Some(caps) = PRICE_REGEX.captures(price_text) {
|
|
if let Some(price_match) = caps.get(1) {
|
|
let p = price_match.as_str().replace(',', "").parse().ok();
|
|
debug!(
|
|
"price regex passed, working on caps:{:?}, price_match:{:?}, p:{:?}",
|
|
caps, price_match, p
|
|
);
|
|
return p;
|
|
}
|
|
}
|
|
|
|
info!(
|
|
"Price string:{:?} failed parsing, returning none.",
|
|
price_text
|
|
);
|
|
None
|
|
}
|
|
|
|
/// Extracts item data from HTML content.
|
|
pub fn extract_data_from_html(
|
|
html_content: &str,
|
|
timestamp: &chrono::DateTime<Utc>,
|
|
category: &str,
|
|
) -> Option<Vec<(Listing, ItemAppearances)>> {
|
|
let document = Html::parse_document(html_content);
|
|
let mut items = Vec::new();
|
|
|
|
let item_selector =
|
|
Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
|
|
let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
|
|
let price_selector = Selector::parse(".s-item__price").unwrap();
|
|
let image_selector =
|
|
Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
|
|
.unwrap();
|
|
let link_selector =
|
|
Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
|
|
let bid_count_selector =
|
|
Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap();
|
|
let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap();
|
|
let _secondary_info_selector = Selector::parse(".s-item__detail--secondary").unwrap();
|
|
let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();
|
|
|
|
for element in document.select(&item_selector) {
|
|
let raw_title_text = element
|
|
.select(&title_selector)
|
|
.next()
|
|
.map(|el| el.text().collect::<String>().trim().to_string());
|
|
let price_text = element
|
|
.select(&price_selector)
|
|
.next()
|
|
.map(|el| el.text().collect::<String>().trim().to_string());
|
|
|
|
let id = element
|
|
.select(&link_selector)
|
|
.next()
|
|
.and_then(|link_el| link_el.value().attr("href"))
|
|
.and_then(|href| ITEM_ID_REGEX.captures(href))
|
|
.and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()))
|
|
.and_then(|id_str| id_str.parse::<i64>().ok());
|
|
|
|
if raw_title_text.is_none() || price_text.is_none() || id.is_none() {
|
|
warn!(
|
|
"Skipping {:?} due to missing title, price, or item ID.",
|
|
element
|
|
);
|
|
continue;
|
|
}
|
|
if id.unwrap() == 123456 {
|
|
info!("Skipping {:?} due to bogus ID of 123456", element);
|
|
continue;
|
|
}
|
|
|
|
let raw_title = raw_title_text.unwrap();
|
|
let price_text = price_text.unwrap();
|
|
|
|
let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
|
|
|
|
let primary_display_price = parse_price(&price_text);
|
|
|
|
let mut current_bid_price: Option<f64> = None;
|
|
let mut final_buy_it_now_price: Option<f64> = None;
|
|
let mut item_is_auction = false;
|
|
|
|
if let Some(bid_el) = element.select(&bid_count_selector).next() {
|
|
if bid_el
|
|
.text()
|
|
.collect::<String>()
|
|
.to_lowercase()
|
|
.contains("bid")
|
|
{
|
|
item_is_auction = true;
|
|
}
|
|
}
|
|
|
|
let has_best_offer = element
|
|
.select(&primary_info_selector)
|
|
.any(|e| e.text().any(|e| e.to_lowercase().contains("or best offer")));
|
|
|
|
if item_is_auction {
|
|
current_bid_price = primary_display_price;
|
|
if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
|
|
final_buy_it_now_price = parse_price(&bin_el.text().collect::<String>());
|
|
}
|
|
} else {
|
|
final_buy_it_now_price = primary_display_price;
|
|
}
|
|
|
|
let image_url = element
|
|
.select(&image_selector)
|
|
.next()
|
|
.and_then(|img_el| {
|
|
img_el
|
|
.value()
|
|
.attr("data-src")
|
|
.or(img_el.value().attr("src"))
|
|
})
|
|
.map(|s| s.to_string())
|
|
.unwrap();
|
|
|
|
items.push((
|
|
Listing {
|
|
title,
|
|
id: 0,
|
|
item_id: id?,
|
|
buy_it_now_price: final_buy_it_now_price,
|
|
has_best_offer,
|
|
image_url,
|
|
},
|
|
ItemAppearances {
|
|
item: id?,
|
|
timestamp: *timestamp,
|
|
current_bid_usd_cents: current_bid_price.map(|b| (b * 100.0).round() as i64),
|
|
category: category.to_owned(),
|
|
},
|
|
));
|
|
}
|
|
Some(items)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    /// Parses a saved results page and checks two of the first ten entries: one with a
    /// buy-it-now price and no bid, and one with a current bid and no buy-it-now price.
    #[test_log::test]
    fn parse() {
        let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
        let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
        let parsed = extract_data_from_html(html, &timestamp, "ssd").unwrap();
        // NOTE(review): the total-count assertion below is disabled; confirm the expected
        // count before re-enabling it.
        // assert_eq!(parsed.len(), 62);

        // Fails the test if fewer than ten items were parsed.
        let parsed = parsed.first_chunk::<10>().unwrap();
        assert_eq!(
            parsed[0],
            (
                Listing {
                    id: 0,
                    item_id: 388484391867,
                    title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
                    buy_it_now_price: Some(59.99),
                    has_best_offer: true,
                    image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
                        .to_string()
                },
                ItemAppearances {
                    item: 388484391867,
                    timestamp,
                    category: "ssd".to_owned(),
                    current_bid_usd_cents: None
                }
            )
        );
        assert_eq!(
            parsed[4],
            (
                Listing {
                    id: 0,
                    item_id: 286605201240,
                    title:
                        "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
                            .to_string(),
                    buy_it_now_price: None,
                    has_best_offer: true,
                    image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
                        .to_string()
                },
                ItemAppearances {
                    item: 286605201240,
                    timestamp,
                    category: "ssd".to_owned(),
                    current_bid_usd_cents: Some(1260)
                }
            )
        );
    }
}
|