Initial rough commit
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 3m30s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m1s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m5s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 9m41s
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 3m30s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 4m1s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m5s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 9m41s
This commit is contained in:
209
src/parser_ebay.rs
Normal file
209
src/parser_ebay.rs
Normal file
@@ -0,0 +1,209 @@
|
||||
use crate::db::Listing;
|
||||
use chrono::Utc;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::Regex;
|
||||
use scraper::{Html, Selector};
|
||||
use tracing::{debug, info, warn};
|
||||
|
||||
lazy_static! {
    /// Captures the numeric item ID that follows `/itm/` in a listing URL.
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();

    /// Matches a leading, case-insensitive "NEW LISTING" marker together with
    /// any separators (`:`, `-`, whitespace) that trail it.
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();

    /// Captures a price-like token (digits, commas, optional decimal part),
    /// optionally preceded by a `$` sign.
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
}
|
||||
|
||||
/// Parses price from a string, taking the first price if it's a range.
|
||||
fn parse_price(price_text: &str) -> Option<f64> {
|
||||
let lower_price_text = price_text.to_lowercase();
|
||||
if lower_price_text.contains(" to ") {
|
||||
if let Some(first_part) = lower_price_text.split(" to ").next() {
|
||||
if let Some(caps) = PRICE_REGEX.captures(first_part) {
|
||||
if let Some(price_match) = caps.get(1) {
|
||||
info!("Price string:{:?} parsed!", price_match);
|
||||
return price_match.as_str().replace(',', "").parse().ok();
|
||||
}
|
||||
}
|
||||
}
|
||||
info!(
|
||||
"Price string:{:?} failed parsing with to, returning none.",
|
||||
price_text
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
if let Some(caps) = PRICE_REGEX.captures(price_text) {
|
||||
if let Some(price_match) = caps.get(1) {
|
||||
let p = price_match.as_str().replace(',', "").parse().ok();
|
||||
debug!(
|
||||
"price regex passed, working on caps:{:?}, price_match:{:?}, p:{:?}",
|
||||
caps, price_match, p
|
||||
);
|
||||
return p;
|
||||
}
|
||||
}
|
||||
|
||||
info!(
|
||||
"Price string:{:?} failed parsing, returning none.",
|
||||
price_text
|
||||
);
|
||||
None
|
||||
}
|
||||
|
||||
/// Extracts item data from HTML content.
|
||||
pub fn extract_data_from_html(
|
||||
html_content: &str,
|
||||
timestamp: &chrono::DateTime<Utc>,
|
||||
) -> Option<Vec<Listing>> {
|
||||
let document = Html::parse_document(html_content);
|
||||
let mut items = Vec::new();
|
||||
|
||||
let item_selector =
|
||||
Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
|
||||
let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
|
||||
let price_selector = Selector::parse(".s-item__price").unwrap();
|
||||
let image_selector =
|
||||
Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
|
||||
.unwrap();
|
||||
let link_selector =
|
||||
Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
|
||||
let bid_count_selector =
|
||||
Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap();
|
||||
let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap();
|
||||
let _secondary_info_selector = Selector::parse(".s-item__detail--secondary").unwrap();
|
||||
let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();
|
||||
|
||||
for element in document.select(&item_selector) {
|
||||
let raw_title_text = element
|
||||
.select(&title_selector)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<String>().trim().to_string());
|
||||
let price_text = element
|
||||
.select(&price_selector)
|
||||
.next()
|
||||
.map(|el| el.text().collect::<String>().trim().to_string());
|
||||
|
||||
let id = element
|
||||
.select(&link_selector)
|
||||
.next()
|
||||
.and_then(|link_el| link_el.value().attr("href"))
|
||||
.and_then(|href| ITEM_ID_REGEX.captures(href))
|
||||
.and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()))
|
||||
.and_then(|id_str| id_str.parse::<i64>().ok());
|
||||
|
||||
if raw_title_text.is_none() || price_text.is_none() || id.is_none() {
|
||||
warn!(
|
||||
"Skipping {:?} due to missing title, price, or item ID.",
|
||||
element
|
||||
);
|
||||
continue;
|
||||
}
|
||||
if id.unwrap() == 123456 {
|
||||
info!("Skipping {:?} due to bogus ID of 123456", element);
|
||||
continue;
|
||||
}
|
||||
|
||||
let raw_title = raw_title_text.unwrap();
|
||||
let price_text = price_text.unwrap();
|
||||
|
||||
let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();
|
||||
|
||||
let primary_display_price = parse_price(&price_text);
|
||||
|
||||
let mut current_bid_price: Option<f64> = None;
|
||||
let mut final_buy_it_now_price: Option<f64> = None;
|
||||
let mut item_is_auction = false;
|
||||
|
||||
if let Some(bid_el) = element.select(&bid_count_selector).next() {
|
||||
if bid_el
|
||||
.text()
|
||||
.collect::<String>()
|
||||
.to_lowercase()
|
||||
.contains("bid")
|
||||
{
|
||||
item_is_auction = true;
|
||||
}
|
||||
}
|
||||
|
||||
let has_best_offer = element
|
||||
.select(&primary_info_selector)
|
||||
.any(|e| e.text().any(|e| e.to_lowercase().contains("or best offer")));
|
||||
|
||||
if item_is_auction {
|
||||
current_bid_price = primary_display_price;
|
||||
if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
|
||||
final_buy_it_now_price = parse_price(&bin_el.text().collect::<String>());
|
||||
}
|
||||
} else {
|
||||
final_buy_it_now_price = primary_display_price;
|
||||
}
|
||||
|
||||
let image_url = element
|
||||
.select(&image_selector)
|
||||
.next()
|
||||
.and_then(|img_el| {
|
||||
img_el
|
||||
.value()
|
||||
.attr("data-src")
|
||||
.or(img_el.value().attr("src"))
|
||||
})
|
||||
.map(|s| s.to_string())
|
||||
.unwrap();
|
||||
|
||||
items.push(Listing {
|
||||
title,
|
||||
id: 0,
|
||||
item_id: id?,
|
||||
added_time: *timestamp,
|
||||
current_bid_price,
|
||||
buy_it_now_price: final_buy_it_now_price,
|
||||
has_best_offer,
|
||||
image_url,
|
||||
});
|
||||
}
|
||||
Some(items)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    /// Parses a captured search-results page and spot-checks two of the first
    /// ten listings: a buy-it-now listing (index 0) and an auction (index 4).
    #[test_log::test]
    fn parse() {
        let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
        let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
        let parsed = extract_data_from_html(html, &timestamp).unwrap();
        // TODO: pin the total number of parsed listings (a previously disabled
        // check expected 62).

        // Fails the test if fewer than ten listings were parsed.
        let parsed = parsed.first_chunk::<10>().unwrap();
        assert_eq!(
            parsed[0],
            Listing {
                id: 0,
                item_id: 388484391867,
                title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
                added_time: timestamp,
                current_bid_price: None,
                buy_it_now_price: Some(59.99),
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
                    .to_string()
            }
        );
        assert_eq!(
            parsed[4],
            Listing {
                id: 0,
                item_id: 286605201240,
                title:
                    "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
                        .to_string(),
                added_time: timestamp,
                current_bid_price: Some(12.60),
                buy_it_now_price: None,
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
                    .to_string()
            }
        );
    }
}
|
Reference in New Issue
Block a user