Initial terrible commit

This commit is contained in:
hak8or 2024-04-15 15:11:47 -04:00
commit 382185f623
7 changed files with 2277 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
/target

1834
Cargo.lock generated Normal file

File diff suppressed because it is too large Load Diff

13
Cargo.toml Normal file
View File

@ -0,0 +1,13 @@
# Package metadata for the eBay compute-hardware listing scraper.
[package]
name = "ebay_compute_scraper"
version = "0.1.0"
edition = "2021"
# scraper: CSS-selector HTML parsing of saved search-result pages.
# log / tracing / tracing-subscriber: structured logging.
# anyhow: application-level error handling (bail!/Result) with backtraces.
# regex: extracting prices and item numbers from listing text.
# reqwest (blocking): synchronous HTTP fetches of eBay search pages.
[dependencies]
scraper = "0.19.0"
log = "0.4.21"
anyhow = { version = "1.0.82", features = ["backtrace", "std"] }
tracing = "0.1.40"
tracing-subscriber = "0.3.18"
regex = "1.10.4"
reqwest = { version = "0.12.3", features = ["blocking"] }

File diff suppressed because one or more lines are too long

3
scrape.sh Executable file
View File

@ -0,0 +1,3 @@
#!/usr/bin/env bash
# Download the newest "ryzen" listings in eBay's CPU/processor category (179),
# saving the raw HTML to a timestamped file for offline parsing by the Rust app.
# Fix: "#!/bin/env bash" is non-portable (env normally lives in /usr/bin);
# the output filename is quoted so an unusual $(date) output cannot word-split.
wget 'https://www.ebay.com/sch/179/i.html?_from=R40&_nkw=ryzen&_sop=15&_blrs=recall_filtering' -O "EbayScrape_ryzen_$(date +%s).html"

48
src/ebay_fetcher.rs Normal file
View File

@ -0,0 +1,48 @@
use std::collections::HashMap;
use anyhow::{bail, Result};
use tracing::{info, error, instrument};
use std::sync::mpsc::{Sender, Receiver};
use std::sync::mpsc;
use std::thread;
use std::thread::Thread;
/// Shared scraping state: the search URLs to fetch and the worker threads
/// spawned to fetch them.
pub struct Context {
    // Maps a short label (e.g. "n100") to the eBay search-results URL to fetch.
    pub urls: HashMap<String, String>,
    // Handles of spawned fetch threads; each resolves to the page body on
    // success. Callers must join these to observe fetch errors.
    pub threads: Vec<std::thread::JoinHandle<Result<String, anyhow::Error>>>
}
impl Default for Context {
fn default() -> Self {
Self {
urls: HashMap::from([
(
"n100".to_string(),
"https://www.ebay.com/sch/i.html?_from=R40&_nkw=n100&_sacat=171957&_sop=15&_blrs=recall_filtering&_ipg=240".to_string()
),
(
"n305".to_string(),
"https://www.ebay.com/sch/i.html?_from=R40&_nkw=n305&_sacat=171957&_sop=15&_blrs=recall_filtering&_ipg=240".to_string()
)
]),
threads: Vec::new()
}
}
}
impl Context {
    /// Spawn a worker thread intended to download `url` and return the page
    /// body; the handle is stored in `self.threads` for the caller to join.
    ///
    /// NOTE(review): the actual HTTP fetch is still stubbed out (see the
    /// commented reqwest call). `filename` and `outdir_dir` are presumably
    /// meant to name the output file once real fetching lands — confirm
    /// before relying on them.
    #[allow(unused_variables)] // filename/outdir_dir are kept for the planned file-output step
    pub fn ebay_fetch_html(&mut self, filename: String, url: String, outdir_dir: String) -> Result<()> {
        let thr = thread::spawn(move || {
            // TODO: real fetch — previously: reqwest::blocking::get(url)?.text()?
            // Stub behaviour: succeed with a placeholder body only for an empty
            // URL, otherwise fail, so both result paths can be exercised.
            if url.is_empty() {
                Ok("hello".to_string())
            } else {
                bail!("hmmm")
            }
        });
        self.threads.push(thr);
        Ok(())
    }
}

223
src/main.rs Normal file
View File

@ -0,0 +1,223 @@
mod ebay_fetcher;
use std::{fs::File, io::Read, path::PathBuf};
use scraper::{ElementRef, Html, Selector, selector::ToCss};
use anyhow::{bail, Result};
use tracing::{info, error, instrument};
use tracing_subscriber::FmtSubscriber;
use regex::Regex;
use crate::ebay_fetcher::Context;
/// All listings parsed out of one saved eBay search-results HTML file.
#[allow(dead_code)]
#[derive(Debug, Default, Clone)]
struct ParsedFile {
    // Path of the scraped HTML file the listings came from.
    filename: String,
    // One entry per ".s-item" result element found in the file.
    listings: Vec<EbayResult>
}
/// One listing scraped from an eBay search-results page.
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct EbayResult {
    // Listing price in dollars (range listings are rejected during parsing).
    pub price: f64,
    // Shipping cost in dollars; 0.0 for free shipping.
    pub shipping: f64,
    // Listing title with the "New Listing" badge text stripped.
    pub title: String,
    // True when the listing is an auction that has received bids.
    pub has_bids: bool,
    // True when the listing text contains "or Best Offer".
    pub allows_best_offer: bool,
    // eBay item number extracted from the /itm/<number> URL.
    pub item_num: u64,
}
#[instrument]
fn parse_ebay_results(filepath: PathBuf) -> Result<Vec<EbayResult>> {
let mut f = File::open(filepath).unwrap();
let mut contents = String::new();
f.read_to_string(&mut contents).unwrap();
let document = Html::parse_document(contents.as_str());
let selector = Selector::parse(".srp-results").unwrap();
let srp_results = document.select(&selector).next().unwrap();
let (elems, errs): (Vec<_>, Vec<_>) = get_items(&srp_results)?
.iter()
.map(parse_item)
.partition(Result::is_ok);
for err in errs {
error!("Hit error: {:?}", err);
}
Ok(elems.into_iter().map(Result::unwrap).collect())
}
#[instrument(skip_all)]
/// Collect every listing element (".s-item") under the results container.
fn get_items<'a>(page: &'a ElementRef<'a>) -> Result<Vec<ElementRef<'a>>> {
    let item_selector = Selector::parse(".s-item").unwrap();
    let items: Vec<ElementRef<'a>> = page.select(&item_selector).collect();
    Ok(items)
}
#[instrument(skip_all)]
/// Extract the listing title from a ".s-item" element.
///
/// # Errors
/// Fails when the listing does not contain exactly one title element, or
/// when the title does not reduce to exactly one text fragment after the
/// "New Listing" badge text is removed.
fn parse_item_title(binding: &ElementRef) -> Result<String> {
    let selector = Selector::parse(".s-item__title").unwrap(); // static selector: cannot fail
    // Collect once instead of the original clone-the-iterator-to-count pattern.
    let titles: Vec<_> = binding.select(&selector).collect();
    if titles.len() != 1 {
        bail!("Expecting only a single title per result! Found:{:?}", titles.len());
    }
    // The title element may carry an extra "New Listing" badge fragment; drop it.
    let fragments: Vec<&str> = titles[0].text().filter(|&t| t != "New Listing").collect();
    if fragments.len() != 1 {
        bail!("Only expecting one title section per result!");
    }
    Ok(fragments[0].to_string())
}
#[instrument(skip_all)]
fn parse_item_price(binding: &ElementRef) -> Result<f64> {
let selector = Selector::parse(".s-item__price").unwrap();
let found = binding.select(&selector);
let mut iter = found.into_iter();
if iter.clone().count() != 1 {
bail!("Expecting only a single price per result! Found:{}", iter.clone().count());
}
let b1 = iter.next().unwrap().text().collect::<Vec<_>>();
let b2 = b1.iter().filter(|&e| *e != "New Listing").collect::<Vec<_>>();
let mut price_str = match b2.len() {
1 => { *b1.first().unwrap() }
3 => {
if *b2[1] == " to " {
bail!("Ignoring ranged listings, range:{:?}", b2);
}
bail!("Found three elements in pricing but unexpected values:{:?}", b2);
}
_ => { bail!("Found unexpected pricing: {:?}", b2); }
};
price_str = price_str.trim_start_matches("$");
Ok(price_str.parse().unwrap())
}
#[instrument(skip_all)]
fn parse_item_shipping(binding: &ElementRef) -> Result<f64> {
let free_x_days_shipping = {
let selector = Selector::parse(".s-item__freeXDays").unwrap();
match binding.select(&selector).count() {
0 => { false }
1 => { true }
unknown => {
bail!("Expecting only a single item__freeXDays per result! Found:{}", unknown);
}
}
};
if free_x_days_shipping {
return Ok(0.00);
}
let selector = Selector::parse(".s-item__shipping").unwrap();
let found = binding.select(&selector);
let mut iter = found.into_iter();
if iter.clone().count() != 1 {
bail!("Expecting only a single shipping price per result! Found:{}", iter.clone().count());
}
let b1 = iter.next().unwrap().text().collect::<Vec<_>>();
if b1.len() != 1 {
bail!("Expected only a single shipping price per result! Found:{:?}", b1);
}
let price_str = *b1.first().unwrap();
if price_str == "Free shipping" {
return Ok(0.00);
}
let price_otherstr = Regex::new(r"\d+\.\d+")
.unwrap()
.find(price_str)
.unwrap().as_str();
Ok(price_otherstr.parse().unwrap())
}
#[instrument(skip_all)]
fn parse_item_ebay_itm(binding: &ElementRef) -> Result<u64> {
let selector = Selector::parse(".s-item__link").unwrap();
let found = binding.select(&selector);
let mut iter = found.into_iter();
if iter.clone().count() != 1 {
bail!("Expecting only a single item link per result! Found:{}", iter.clone().count());
}
let b1 = iter.next().unwrap().attr("href").into_iter().collect::<Vec<_>>();
if b1.len() != 1 {
bail!("Expected only a single item link per result! Found:{:?}", b1);
}
let mut url = *b1.first().unwrap();
url = url.trim_start_matches("https://www.ebay.com/itm/");
let price_otherstr = Regex::new(r"\d+")
.unwrap()
.find(url)
.unwrap().as_str();
Ok(price_otherstr.parse().unwrap())
}
#[instrument(skip_all)]
fn item_has_bids(binding: &ElementRef) -> bool {
let selector_bids = Selector::parse(".s-item__bids").unwrap();
// let selector_bidcount = Selector::parse("s-item__bidCount").unwrap();
match binding.select(&selector_bids).count() {
0 => { false }
1 => { true }
uhm => { error!("Found {} an unexpected {} times", selector_bids.to_css_string(), uhm); false }
}
}
/// True when the seller accepts offers, detected by an "or Best Offer"
/// fragment among the ".s-item__purchaseOptions" text nodes.
fn item_has_bestoffer(binding: &ElementRef) -> bool {
    let options_selector = Selector::parse(".s-item__purchaseOptions").unwrap();
    binding
        .select(&options_selector)
        .flat_map(|elem| elem.text())
        .any(|fragment| fragment == "or Best Offer")
}
#[instrument(skip_all)]
/// Parse one ".s-item" listing element into an EbayResult.
///
/// The fallible extractors run in the same order as the original struct
/// literal (price, title, bids, shipping, item number, best-offer) so the
/// first error reported is unchanged.
fn parse_item(elem: &ElementRef) -> Result<EbayResult> {
    let price = parse_item_price(elem)?;
    let title = parse_item_title(elem)?;
    let has_bids = item_has_bids(elem);
    let shipping = parse_item_shipping(elem)?;
    let item_num = parse_item_ebay_itm(elem)?;
    let allows_best_offer = item_has_bestoffer(elem);
    Ok(EbayResult { price, shipping, title, has_bids, allows_best_offer, item_num })
}
fn main() -> Result<()>{
tracing::subscriber::set_global_default(
FmtSubscriber::builder()
.with_max_level(tracing::Level::INFO)
.finish()
)
.expect("setting default subscriber failed");
let parsed_files = vec![ParsedFile::default(); 0];
let mut ebay_ctx = ebay_fetcher::Context::default();
ebay_ctx.ebay_fetch_html("filename".to_string(), "someurl".to_string(), "somedir".to_string())?;
for e in parse_ebay_results(PathBuf::from("EbayScrape_ryzen_1713039640.html"))? {
info!("{:?}", e);
}
Ok(())
}