Initial rough commit
32  .github/workflows/ci.yml  vendored  Normal file
@@ -0,0 +1,32 @@
name: Cargo Build & Test

on:
  push:
  pull_request:

env:
  CARGO_TERM_COLOR: always

jobs:
  build_and_test:
    name: Rust project - latest
    runs-on: ubuntu-latest
    strategy:
      matrix:
        rust_ver: # Try to not let this blow up in size, let's keep this to ~4
          - 1.88
          - 1.87
          - 1.86
          - 1.85.1
    # Very ugly way to force cargo and jazz to be in PATH across runs, but
    # let's fix this later.
    steps:
      - uses: actions/checkout@v4
      - run: curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain none -y
      - run: echo "$HOME/.cargo/bin" >> $GITHUB_PATH
      - run: rustup update ${{ matrix.rust_ver }} && rustup default ${{ matrix.rust_ver }}
      - run: cargo build
      - run: cargo test
      - run: cargo build --release
      - run: cargo test --release
      - run: cargo bench
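The same matrix can be reproduced locally before pushing. A rough sketch, assuming rustup is already installed (the version list mirrors rust_ver above):

  $ for v in 1.88 1.87 1.86 1.85.1; do
  >   rustup update "$v" && rustup default "$v"
  >   cargo build && cargo test && cargo build --release && cargo test --release && cargo bench
  > done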
1  .gitignore  vendored  Normal file
@@ -0,0 +1 @@
/target
2554  Cargo.lock  generated  Normal file
File diff suppressed because it is too large
24  Cargo.toml  Normal file
@@ -0,0 +1,24 @@
[package]
name = "ebay_scraper_rust"
version = "0.1.0"
edition = "2024"

[dependencies]
actix-web = "4.11.0"
chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.40", features = ["derive"] }
dirs = "6.0.0"
env_logger = "0.11.8"
lazy_static = "1.5.0"
log = "0.4.27"
regex = "1.11.1"
rusqlite = { version = "0.36.0", features = ["bundled", "chrono"] }
scraper = "0.23.1"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
test-log = { version = "0.2.17", features = ["trace"] }
tracing = "0.1.41"
tracing-subscriber = "0.3.19"

[dev-dependencies]
similar-asserts = "1.7.0"
349  src/db.rs  Normal file
@@ -0,0 +1,349 @@
use chrono::{DateTime, Utc};
use rusqlite::Connection;
use serde::Serialize;
use std::path::Path;

/// A named eBay search URL; each scraped category maps to one of these.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct SearchURL {
    pub full_url: String,
    pub name: String,
}
impl SearchURL {
    pub fn initialized_db(conn: &Connection) {
        conn.execute(
            "CREATE TABLE IF NOT EXISTS SearchURLs (
                id INTEGER PRIMARY KEY,
                url TEXT NOT NULL UNIQUE,
                name TEXT NOT NULL UNIQUE
            )",
            (),
        )
        .unwrap();
    }

    pub fn lookup(conn: &Connection, name: &str) -> Option<Self> {
        let mut stmt = conn
            .prepare("SELECT * FROM SearchURLs WHERE name = ?")
            .ok()?;
        stmt.query_one([name], |row| {
            Ok(SearchURL {
                // id: row.get(0)?,
                full_url: row.get(1)?,
                name: row.get(2)?,
            })
        })
        .ok()
    }

    pub fn add_or_update(&self, conn: &Connection) {
        conn.execute(
            "INSERT OR REPLACE INTO SearchURLs (name, url) VALUES (?1, ?2)",
            (&self.name, &self.full_url),
        )
        .unwrap();
    }

    /// All known category names.
    pub fn names(conn: &Connection) -> Vec<String> {
        let mut stmt = conn.prepare("SELECT name FROM SearchURLs").unwrap();
        stmt.query_map([], |row| row.get(0))
            .unwrap()
            .map(|e| e.unwrap())
            .collect()
    }
}

/// A single scraped search results page, identified by when it was fetched
/// and which category it belongs to.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedPage {
    pub timestamp: DateTime<Utc>,
    pub category: String,
}
impl ParsedPage {
    pub fn initialized_db(conn: &Connection) {
        conn.execute(
            "CREATE TABLE IF NOT EXISTS Pages_Parsed (
                id INTEGER PRIMARY KEY,
                category TEXT NOT NULL,
                timestamp INTEGER NOT NULL,
                UNIQUE(category, timestamp),
                FOREIGN KEY(category) REFERENCES SearchURLs(name)
            )",
            (),
        )
        .unwrap();
    }

    pub fn lookup_db(conn: &Connection, timestamp: DateTime<Utc>) -> Option<Self> {
        let mut stmt = conn
            .prepare("SELECT * FROM Pages_Parsed WHERE timestamp = ?")
            .ok()?;
        stmt.query_one([timestamp], |row| {
            Ok(ParsedPage {
                // id: row.get(0)?,
                category: row.get(1)?,
                timestamp: row.get(2)?,
            })
        })
        .ok()
    }

    pub fn add_or_update_db(&self, conn: &Connection) {
        conn.execute(
            "INSERT OR REPLACE INTO Pages_Parsed (category, timestamp) VALUES (?1, ?2)",
            (&self.category, self.timestamp),
        )
        .unwrap();
    }
}

/// Size/quantity information parsed out of a listing title.
#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
pub struct ParsedStorage {
    pub id: i64,
    pub item: i64,
    pub total_gigabytes: i64,
    pub quantity: i64,
    pub individual_size_gigabytes: i64,
    pub parse_engine: i64,
    pub needed_description_check: bool,
}
impl ParsedStorage {
    pub fn initialized_db(conn: &Connection) {
        conn.execute(
            "CREATE TABLE IF NOT EXISTS Storage_Parsed (
                id INTEGER PRIMARY KEY,
                item INTEGER,
                total_gigabytes INTEGER,
                quantity INTEGER,
                sizes_gigabytes TEXT,
                parse_engine INTEGER,
                need_description_check INTEGER,
                UNIQUE(item, parse_engine),
                FOREIGN KEY(item) REFERENCES Ebay_Items(item_id)
            )",
            (),
        )
        .unwrap();
    }

    pub fn lookup_db(conn: &Connection, item: i64) -> Vec<ParsedStorage> {
        let mut stmt = conn
            .prepare("SELECT * FROM Storage_Parsed WHERE item = ?")
            .unwrap();
        stmt.query_map([item], |row| {
            Ok(ParsedStorage {
                id: row.get(0)?,
                item: row.get(1)?,
                total_gigabytes: row.get(2)?,
                quantity: row.get(3)?,
                individual_size_gigabytes: {
                    // Stored as TEXT in the table, parse it back into an integer.
                    let r: String = row.get(4)?;
                    r.parse().unwrap()
                },
                parse_engine: row.get(5)?,
                needed_description_check: row.get(6)?,
            })
        })
        .unwrap()
        .map(|e| e.unwrap())
        .collect()
    }

    pub fn add_or_update_db(&self, conn: &Connection) {
        conn.execute("
            INSERT OR REPLACE INTO Storage_Parsed (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
            VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
            (
                &self.item,
                self.total_gigabytes,
                self.quantity,
                self.individual_size_gigabytes.to_string(),
                self.parse_engine,
                self.needed_description_check
            )
        ).unwrap();
    }
}

/// A single eBay listing as scraped off a search results page.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct Listing {
    pub id: i64,
    pub item_id: i64,
    pub title: String,
    pub added_time: DateTime<Utc>,
    pub current_bid_price: Option<f64>,
    pub buy_it_now_price: Option<f64>,
    pub has_best_offer: bool,
    pub image_url: String,
}
impl Listing {
    fn initialized(conn: &Connection) {
        conn.execute(
            "CREATE TABLE IF NOT EXISTS Ebay_Items (
                id INTEGER PRIMARY KEY,
                item_id INTEGER NOT NULL UNIQUE,
                title TEXT NOT NULL,
                added_time INTEGER NOT NULL,
                current_bid_usd_cents INTEGER,
                buy_it_now_usd_cents INTEGER,
                has_best_offer INTEGER NOT NULL,
                image_url TEXT NOT NULL
            )",
            (),
        )
        .unwrap();
    }

    pub fn lookup(conn: &Connection, item_id: i64) -> Option<Listing> {
        let mut stmt = conn
            .prepare("SELECT * FROM Ebay_Items WHERE item_id = ?")
            .ok()?;
        stmt.query_one([item_id], |row| {
            Ok(Listing {
                id: row.get(0)?,
                item_id: row.get(1)?,
                title: row.get(2)?,
                added_time: row.get(3)?,
                current_bid_price: row.get(4)?,
                buy_it_now_price: row.get(5)?,
                has_best_offer: row.get(6)?,
                image_url: row.get(7)?,
            })
        })
        .ok()
    }

    pub fn lookup_since(conn: &Connection, since: i64, limit: i64) -> Vec<Self> {
        let mut stmt = conn
            .prepare(
                "SELECT * FROM Ebay_Items
                 WHERE added_time >= ?1
                 ORDER BY added_time
                 LIMIT ?2",
            )
            .unwrap();

        stmt.query_map([since, limit], |row| {
            Ok(Listing {
                id: row.get(0)?,
                item_id: row.get(1)?,
                title: row.get(2)?,
                added_time: row.get(3)?,
                current_bid_price: row.get(4)?,
                buy_it_now_price: row.get(5)?,
                has_best_offer: row.get(6)?,
                image_url: row.get(7)?,
            })
        })
        .unwrap()
        .map(|e| e.unwrap())
        .collect()
    }

    /// Item ids (and titles) which have no entry in Storage_Parsed yet.
    pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
        let mut stmt = conn
            .prepare(
                "SELECT ei.item_id, ei.title FROM Ebay_Items AS ei
                 LEFT JOIN Storage_Parsed AS sp ON ei.item_id = sp.item
                 WHERE sp.item IS NULL",
            )
            .unwrap();
        stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
            .unwrap()
            .map(|e| e.unwrap())
            .collect()
    }

    pub fn add_or_update(&self, conn: &Connection) {
        let count = conn.execute(
            "INSERT OR REPLACE INTO Ebay_Items (item_id, title, added_time, current_bid_usd_cents, buy_it_now_usd_cents, has_best_offer, image_url)
             VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            (
                self.item_id,
                &self.title,
                self.added_time,
                self.current_bid_price,
                self.buy_it_now_price,
                self.has_best_offer,
                self.image_url.clone()
            )
        ).unwrap();
        if count != 1 {
            panic!("Expected count to be 1 but got {}", count);
        }
    }
}

/// Opens (or creates) the database and ensures all tables exist. `None`
/// yields an in-memory database, handy for tests.
pub fn get_initialized(path: Option<&Path>) -> Connection {
    let conn = match path {
        Some(p) => Connection::open(p),
        None => Connection::open_in_memory(),
    }
    .unwrap();

    SearchURL::initialized_db(&conn);
    Listing::initialized(&conn);
    ParsedStorage::initialized_db(&conn);
    ParsedPage::initialized_db(&conn);

    conn
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sanity_check() {
        let db = get_initialized(None);

        let searchurl = SearchURL {
            full_url: "google".to_owned(),
            name: "ssd".to_owned(),
        };
        searchurl.add_or_update(&db);
        assert_eq!(SearchURL::lookup(&db, &searchurl.name), Some(searchurl));

        let listing = Listing {
            id: 1,
            item_id: 1234,
            title: "Some Title".to_string(),
            added_time: std::time::SystemTime::now().into(),
            current_bid_price: Some(0.12),
            buy_it_now_price: Some(1.23),
            has_best_offer: false,
            image_url: "google.com".to_string(),
        };
        listing.add_or_update(&db);
        assert_eq!(Listing::lookup(&db, listing.item_id), Some(listing.clone()));

        let parsed = ParsedStorage {
            id: 1,
            item: 1234,
            total_gigabytes: 13,
            quantity: 3,
            individual_size_gigabytes: 13,
            parse_engine: 9,
            needed_description_check: true,
        };
        parsed.add_or_update_db(&db);
        assert_eq!(ParsedStorage::lookup_db(&db, parsed.item), vec![parsed]);

        let page = ParsedPage {
            category: "ssd".to_owned(),
            timestamp: std::time::SystemTime::now().into(),
        };
        page.add_or_update_db(&db);
        assert_eq!(ParsedPage::lookup_db(&db, page.timestamp), Some(page));
    }
}
4  src/lib.rs  Normal file
@@ -0,0 +1,4 @@
pub mod db;
pub mod parser;
pub mod parser_ebay;
pub mod parser_storage;
190  src/main.rs  Normal file
@@ -0,0 +1,190 @@
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use clap::Parser;
use ebay_scraper_rust::db::{Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::info;

mod xdg_dirs;

#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for homelab purposes"
)]
struct Args {}

#[get("/page/{timestamp}")]
async fn page_get(
    db: Data<Mutex<rusqlite::Connection>>,
    timestamp: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedPage::lookup_db(
        &db.lock().unwrap(),
        chrono::DateTime::from_timestamp(*timestamp, 0).unwrap(),
    )))
}

#[get("/listing/{id}")]
async fn listing_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
}

#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
    db: Data<Mutex<rusqlite::Connection>>,
    req: web::Path<(i64, i64)>,
) -> Result<impl Responder> {
    Ok(web::Json(Listing::lookup_since(
        &db.lock().unwrap(),
        req.0,
        req.1,
    )))
}

#[post("/listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
    let mut cnt = 0;
    let db_unlocked = db.lock().unwrap();
    Listing::lookup_non_parsed(&db_unlocked)
        .iter()
        .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
        .inspect(|_| cnt += 1)
        .for_each(|ps| ps.add_or_update_db(&db_unlocked));

    Ok(web::Json(cnt))
}

#[get("/listing/parse/{id}")]
async fn listing_parse_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedStorage::lookup_db(
        &db.lock().unwrap(),
        *id,
    )))
}

pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
    if !std::fs::exists(path).expect("Directory must exist") {
        panic!(
            "Directory {:?} does not exist, cannot grab timestamps from there.",
            path
        );
    }

    std::fs::read_dir(path)
        .unwrap()
        .inspect(|fpath| info!("Found {:?}", fpath))
        .map(|fpath| fpath.unwrap().path())
        .filter_map(|fstem| {
            fstem
                .file_stem()
                .and_then(|s| s.to_str())
                .expect("Invalid file name")
                .parse()
                .ok()
        })
        .collect()
}

#[post("/page/parse/{category}")]
async fn parse_post(
    db: Data<Mutex<rusqlite::Connection>>,
    downloaddir: Data<PathBuf>,
    category: web::Path<String>,
) -> Result<impl Responder> {
    let dir = &downloaddir.join(category.clone());

    // Ensure the category is created.
    let url: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
    info!("{:?}", url);
    let su = SearchURL {
        full_url: url.to_string(),
        name: category.to_string(),
    };
    info!("{:?}", su);
    su.add_or_update(&db.lock().unwrap());

    let added: u64 = timestamps_from_dir(dir)
        .iter()
        .filter(|t| {
            info!("Checking for the existence of page {t}");
            let p = ParsedPage::lookup_db(
                &db.lock().unwrap(),
                chrono::DateTime::from_timestamp(**t, 0).unwrap(),
            );

            // Timestamp never seen before, let's pass it on.
            if p.is_none() {
                return true;
            }

            // Timestamp was seen before *and* from the same category, don't
            // pass it on.
            if p.unwrap().category == *category {
                return false;
            }
            true
        })
        .map(|t| {
            let timestamp = chrono::DateTime::from_timestamp(*t, 0).unwrap();
            info!("Adding or updating db with timestamp:{timestamp} category:{category}");
            ParsedPage {
                timestamp,
                category: category.to_string(),
            }
            .add_or_update_db(&db.lock().unwrap());

            let mut cnt = 0;
            parser_ebay::extract_data_from_html(
                &std::fs::read_to_string(dir.join(format!("{t}.html"))).unwrap(),
                &timestamp,
            )
            .unwrap()
            .iter()
            .for_each(|l| {
                cnt += 1;
                l.add_or_update(&db.lock().unwrap());
                info!("Inserting id:{}, title:{}", l.item_id, l.title);
            });
            cnt
        })
        .sum();

    Ok(added.to_string())
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
    let _ = Args::parse();

    let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
    info!(
        "Starting with scraped data dir of \"{}\".",
        scrapedatadir.to_str().unwrap()
    );
    let db_mutex = Data::new(Mutex::new(get_initialized(None)));

    HttpServer::new(move || {
        App::new()
            .service(page_get)
            .service(listing_get)
            .service(listing_since_get)
            .service(parse_post)
            .service(parse_listings)
            .app_data(db_mutex.clone())
            .app_data(Data::new(scrapedatadir.clone()))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}
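With the server running, the routes above can be smoke-tested from a shell. This is an illustrative sketch, assuming the default 127.0.0.1:8080 bind from main() and an already-populated "ssd" category; the item id and timestamp are made-up examples:

  $ curl -X POST http://127.0.0.1:8080/page/parse/ssd        # ingest newly scraped "ssd" pages
  $ curl -X POST http://127.0.0.1:8080/listing/parse         # parse storage info out of unparsed titles
  $ curl http://127.0.0.1:8080/listing/388484391867          # one listing, by eBay item id
  $ curl http://127.0.0.1:8080/listing/since/1750369000/10   # up to 10 listings added since a unix timestamp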
1  src/parser.rs  Normal file
@@ -0,0 +1 @@

209  src/parser_ebay.rs  Normal file
@@ -0,0 +1,209 @@
use crate::db::Listing;
use chrono::Utc;
use lazy_static::lazy_static;
use regex::Regex;
use scraper::{Html, Selector};
use tracing::{debug, info, warn};

lazy_static! {
    static ref PRICE_REGEX: Regex = Regex::new(r"\$?([\d,]+\.?\d*)").unwrap();
    static ref ITEM_ID_REGEX: Regex = Regex::new(r"/itm/(\d+)").unwrap();
    static ref NEW_LISTING_REGEX: Regex = Regex::new(r"(?i)^\s*NEW LISTING\s*[:\-\s]*").unwrap();
}

/// Parses a price from a string, taking the first price if it's a range.
fn parse_price(price_text: &str) -> Option<f64> {
    let lower_price_text = price_text.to_lowercase();
    if lower_price_text.contains(" to ") {
        if let Some(first_part) = lower_price_text.split(" to ").next() {
            if let Some(caps) = PRICE_REGEX.captures(first_part) {
                if let Some(price_match) = caps.get(1) {
                    info!("Price string:{:?} parsed!", price_match);
                    return price_match.as_str().replace(',', "").parse().ok();
                }
            }
        }
        info!(
            "Price string:{:?} failed parsing as a range, returning none.",
            price_text
        );
        return None;
    }

    if let Some(caps) = PRICE_REGEX.captures(price_text) {
        if let Some(price_match) = caps.get(1) {
            let p = price_match.as_str().replace(',', "").parse().ok();
            debug!(
                "price regex passed, working on caps:{:?}, price_match:{:?}, p:{:?}",
                caps, price_match, p
            );
            return p;
        }
    }

    info!(
        "Price string:{:?} failed parsing, returning none.",
        price_text
    );
    None
}

/// Extracts item data from HTML content.
pub fn extract_data_from_html(
    html_content: &str,
    timestamp: &chrono::DateTime<Utc>,
) -> Option<Vec<Listing>> {
    let document = Html::parse_document(html_content);
    let mut items = Vec::new();

    let item_selector =
        Selector::parse("li.s-item, li.srp-results__item, div.s-item[role='listitem']").unwrap();
    let title_selector = Selector::parse(".s-item__title, .srp-results__title").unwrap();
    let price_selector = Selector::parse(".s-item__price").unwrap();
    let image_selector =
        Selector::parse(".s-item__image-wrapper img.s-item__image-img, .s-item__image img")
            .unwrap();
    let link_selector =
        Selector::parse("a.s-item__link[href*='/itm/'], .s-item__info > a[href*='/itm/']").unwrap();
    let bid_count_selector =
        Selector::parse(".s-item__bid-count, .s-item__bids, .s-item__bidCount").unwrap();
    let primary_info_selector = Selector::parse(".s-item__detail--primary").unwrap();
    let _secondary_info_selector = Selector::parse(".s-item__detail--secondary").unwrap();
    let auction_bin_price_selector = Selector::parse(".s-item__buy-it-now-price").unwrap();

    for element in document.select(&item_selector) {
        let raw_title_text = element
            .select(&title_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());
        let price_text = element
            .select(&price_selector)
            .next()
            .map(|el| el.text().collect::<String>().trim().to_string());

        let id = element
            .select(&link_selector)
            .next()
            .and_then(|link_el| link_el.value().attr("href"))
            .and_then(|href| ITEM_ID_REGEX.captures(href))
            .and_then(|caps| caps.get(1).map(|m| m.as_str().to_string()))
            .and_then(|id_str| id_str.parse::<i64>().ok());

        if raw_title_text.is_none() || price_text.is_none() || id.is_none() {
            warn!(
                "Skipping {:?} due to missing title, price, or item ID.",
                element
            );
            continue;
        }
        if id.unwrap() == 123456 {
            info!("Skipping {:?} due to bogus ID of 123456", element);
            continue;
        }

        let raw_title = raw_title_text.unwrap();
        let price_text = price_text.unwrap();

        let title = NEW_LISTING_REGEX.replace(&raw_title, "").trim().to_string();

        let primary_display_price = parse_price(&price_text);

        let mut current_bid_price: Option<f64> = None;
        let mut final_buy_it_now_price: Option<f64> = None;
        let mut item_is_auction = false;

        if let Some(bid_el) = element.select(&bid_count_selector).next() {
            if bid_el
                .text()
                .collect::<String>()
                .to_lowercase()
                .contains("bid")
            {
                item_is_auction = true;
            }
        }

        let has_best_offer = element
            .select(&primary_info_selector)
            .any(|e| e.text().any(|e| e.to_lowercase().contains("or best offer")));

        if item_is_auction {
            current_bid_price = primary_display_price;
            if let Some(bin_el) = element.select(&auction_bin_price_selector).next() {
                final_buy_it_now_price = parse_price(&bin_el.text().collect::<String>());
            }
        } else {
            final_buy_it_now_price = primary_display_price;
        }

        let image_url = element
            .select(&image_selector)
            .next()
            .and_then(|img_el| {
                img_el
                    .value()
                    .attr("data-src")
                    .or(img_el.value().attr("src"))
            })
            .map(|s| s.to_string())
            .unwrap();

        items.push(Listing {
            title,
            id: 0,
            item_id: id?,
            added_time: *timestamp,
            current_bid_price,
            buy_it_now_price: final_buy_it_now_price,
            has_best_offer,
            image_url,
        });
    }
    Some(items)
}

#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    #[test_log::test]
    fn parse() {
        let timestamp = chrono::DateTime::from_timestamp(1750369463, 0).unwrap();
        let html = include_str!("../test_data/ebay_scraper/raw_scraped/ssd/1750369463.html");
        let parsed = extract_data_from_html(html, &timestamp).unwrap();
        // assert_eq!(parsed.len(), 62);

        let parsed = parsed.first_chunk::<10>().unwrap();
        assert_eq!(
            parsed[0],
            Listing {
                id: 0,
                item_id: 388484391867,
                title: "WD Blue 2.5-Inch 3D NAND SATA SSD 1TB - WDBNCE0010PNC-WRSN".to_string(),
                added_time: timestamp,
                current_bid_price: None,
                buy_it_now_price: Some(59.99),
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/wQYAAeSwOTtoN8SC/s-l500.webp"
                    .to_string()
            }
        );
        assert_eq!(
            parsed[4],
            Listing {
                id: 0,
                item_id: 286605201240,
                title:
                    "Fanxiang M.2 SSD 1TB NVMe PCIe Gen 3x 4 M2 Internal Solid State Drive 3500MB/s"
                        .to_string(),
                added_time: timestamp,
                current_bid_price: Some(12.60),
                buy_it_now_price: None,
                has_best_offer: true,
                image_url: "https://i.ebayimg.com/images/g/3NoAAeSwPrtoDb1O/s-l500.webp"
                    .to_string()
            }
        );
    }
}
208  src/parser_storage.rs  Normal file
@@ -0,0 +1,208 @@
use crate::db::ParsedStorage;
use lazy_static::lazy_static;
use regex::Regex;

// let parsed_size_info = crate::parser_storage::parse_size_and_quantity(&cleaned_title);
// let _cost_per_tb = if let Some(price) = primary_display_price {
//     if parsed_size_info.total_tb > 0.0 {
//         Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
//     } else {
//         None
//     }
// } else {
//     None
// };

lazy_static! {
    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
        Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
        Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
        Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
    ];
    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
    static ref SIZE_RANGE_REGEX: Regex =
        Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)")
            .unwrap();
}

/// Parses size and quantity information from an item title.
pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
    let upper_title = title.to_uppercase();
    let mut total_gb = 0i64;
    let mut quantity = 1i64;
    let mut needed_description_check = false;
    let mut individual_size_gb = 0i64;

    // Look for an explicit quantity ("LOT OF 6", "4-PACK", ...), capped to a
    // sane range.
    for pattern in EXPLICIT_QTY_PATTERNS.iter() {
        if let Some(caps) = pattern.captures(&upper_title) {
            if let Some(qty_match) = caps.get(1) {
                if let Ok(parsed_qty) = qty_match.as_str().parse::<i64>() {
                    if parsed_qty > 0 && parsed_qty < 500 {
                        quantity = parsed_qty;
                        break;
                    }
                }
            }
        }
    }

    // Collect every "<number> GB/TB" mention in the title.
    let mut size_matches: Vec<(f64, String)> = Vec::new();
    for caps in SIZE_REGEX.captures_iter(&upper_title) {
        if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
            if let Ok(val) = val_str.as_str().parse::<f64>() {
                size_matches.push((val, unit_str.as_str().to_string()));
            }
        }
    }

    if !size_matches.is_empty() {
        let mut unique_sizes_gb: Vec<i64> = size_matches
            .iter()
            .map(|(val, unit)| (if unit == "TB" { *val * 1024.0 } else { *val }) as i64)
            .collect();
        unique_sizes_gb.sort();
        unique_sizes_gb.dedup();

        if !unique_sizes_gb.is_empty() {
            individual_size_gb = unique_sizes_gb[0];
            if unique_sizes_gb.len() > 1 {
                needed_description_check = true;
            }
        }
    }

    if SIZE_RANGE_REGEX.is_match(&upper_title) {
        needed_description_check = true;
    }
    if quantity > 1 && upper_title.contains("MIXED") {
        needed_description_check = true;
    }
    if upper_title.contains("CHECK THE DESCRIPTION")
        || upper_title.contains("CHECK DESCRIPTION")
        || upper_title.contains("SEE DESCRIPTION")
    {
        if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
            needed_description_check = true;
        }
    }

    if individual_size_gb > 0 {
        total_gb = individual_size_gb * quantity;
    }

    if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
        needed_description_check = true;
    }

    ParsedStorage {
        id: 0,
        item: item_id,
        total_gigabytes: total_gb,
        quantity,
        individual_size_gigabytes: individual_size_gb,
        needed_description_check,
        parse_engine: 0,
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    static TESTS: &[(&str, ParsedStorage, bool)] = &[
        (
            "Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD",
            ParsedStorage {
                id: 0,
                item: 0,
                total_gigabytes: 512 * 3,
                quantity: 3,
                individual_size_gigabytes: 512,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Samsung SM883 2.5\" 240GB SATA 6Gbps MZ7KH240HAHQ-00005",
            ParsedStorage {
                id: 0,
                item: 1,
                total_gigabytes: 240,
                quantity: 1,
                individual_size_gigabytes: 240,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "1TB AData SU650 2.5-inch SATA 6Gb/s SSD Solid State Disk (READ)",
            ParsedStorage {
                id: 0,
                item: 2,
                total_gigabytes: 1024,
                quantity: 1,
                individual_size_gigabytes: 1024,
                parse_engine: 0,
                needed_description_check: true,
            },
            false, // Sadly this one fails :/
        ),
        (
            "Hitachi VSP 7TB Flash Module Drive (FMD) 3286734-A DKC-F810I-7R0FP",
            ParsedStorage {
                id: 0,
                item: 4,
                total_gigabytes: 7 * 1024,
                quantity: 1,
                individual_size_gigabytes: 7 * 1024,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)",
            ParsedStorage {
                id: 0,
                item: 5,
                total_gigabytes: 6 * 256,
                quantity: 6,
                individual_size_gigabytes: 256,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Lenovo-Micron 5300 Pro 1.92TB 2.5\" Sata SSD 02JG540 (MTFDDAK1T9TDS)",
            ParsedStorage {
                id: 0,
                item: 6,
                total_gigabytes: 1966,
                quantity: 1,
                individual_size_gigabytes: 1966,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
    ];

    #[test_log::test]
    fn parse() {
        for t in TESTS {
            if t.2 {
                assert_eq!(t.1, parse_size_and_quantity(t.1.item, t.0));
            }
        }
    }
}
74  src/xdg_dirs.rs  Normal file
@@ -0,0 +1,74 @@
use std::path::PathBuf;
use tracing::{info, warn};

enum XdgType {
    XdgDataHome,

    #[allow(dead_code)]
    XdgConfigHome,
    #[allow(dead_code)]
    XdgCacheHome,
    #[allow(dead_code)]
    XdgStateHome,
    #[allow(dead_code)]
    XdgRuntimeDir,
}

fn ensure_xdg_dir_exists(app_name: &str, xdg_type: XdgType) -> PathBuf {
    let dir = match xdg_type {
        XdgType::XdgDataHome => dirs::data_local_dir().unwrap_or_else(|| {
            // Fall back to the XDG default of ~/.local/share.
            let d = dirs::home_dir().unwrap().join(".local").join("share");
            warn!(
                "OS returned no data local dir (XDG_DATA_HOME), HOME is {:?}, using {:?}!",
                dirs::home_dir(),
                d
            );
            d
        }),
        XdgType::XdgConfigHome => todo!("Not yet tested/implemented"),
        XdgType::XdgCacheHome => todo!("Not yet tested/implemented"),
        XdgType::XdgStateHome => todo!("Not yet tested/implemented"),
        XdgType::XdgRuntimeDir => todo!("Not yet tested/implemented"),
    };
    if !std::fs::exists(&dir).unwrap() {
        panic!(
            "Base directory to use for this app does not exist at {:?}",
            dir
        );
    }

    let dir = dir.join(app_name);
    if !std::fs::exists(&dir).unwrap() {
        info!(
            "App directory to use for this app does not exist at {:?}, creating now.",
            dir
        );
        std::fs::create_dir(&dir).unwrap();
    }
    dir
}

pub fn ensure_scrapedata_dir_exists(app_name: &str, override_path: Option<PathBuf>) -> PathBuf {
    // An override path must already exist; it is never created here.
    if let Some(p) = override_path {
        if !std::fs::exists(&p).unwrap() {
            panic!(
                "Override path of {:?} was given but does not exist, bailing.",
                p
            );
        }
        return p;
    }

    let app_dir = ensure_xdg_dir_exists(app_name, XdgType::XdgDataHome);

    let raw_scraped_dir = app_dir.join("raw_scraped");
    if !std::fs::exists(&raw_scraped_dir).unwrap() {
        info!(
            "Scrape directory to use for this app does not exist at {:?}, creating now.",
            raw_scraped_dir
        );
        std::fs::create_dir(&raw_scraped_dir).unwrap();
    }

    raw_scraped_dir
}
11  systemd/scraper_fetch.service  Normal file
@@ -0,0 +1,11 @@
[Unit]
Description=Run a single instance of a fetch
After=syslog.target network.target

[Service]
Type=exec
#Environment=XDG_DATA_HOME=/home/hak8or/code/ebay_scraper_rust/.tmp_run
ExecStart=/usr/local/bin/scraper_fetch.sh

[Install]
WantedBy=multi-user.target
31  systemd/scraper_fetch.sh  Executable file
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

URL_PER_PAGE_60="&_ipg=60"
URL_PER_PAGE_240="&_ipg=240"
URL_MIN_PRICE_USD_60="&_udlo=60.00"
URL_SEARCHTERM_NONE="&_nkw="
URL_LOCATION_NORTHAMERICA="&LH_PrefLoc=3"
URL_BASE_LISTING="https://www.ebay.com/sch/i.html?"

if [ -z "${XDG_DATA_HOME}" ]; then
    echo "XDG_DATA_HOME was not set, bailing!"
    exit 1
fi

DIR_SSDS="$XDG_DATA_HOME/ebay_scraper/raw_scraped/ssd"
mkdir -p "$DIR_SSDS"
if [ ! -s "$DIR_SSDS/url.json" ]; then
    URL_CATEGORY_SSD="&_sacat=175669"
    URL_SSDS="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_SSD&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
    echo "{\"url\": \"$URL_SSDS\"}" > "$DIR_SSDS/url.json"
fi
# -O (capital) saves the page itself; lowercase -o would save wget's log instead.
wget -O "$DIR_SSDS/$(date +%s).html" "$(jq -r '.url' "$DIR_SSDS/url.json")"

DIR_MINIPC="$XDG_DATA_HOME/ebay_scraper/raw_scraped/minipc"
mkdir -p "$DIR_MINIPC"
if [ ! -s "$DIR_MINIPC/url.json" ]; then
    URL_CATEGORY_MINIPC_ALLINONE="&_sacat=179"
    URL_MINIPC="$URL_BASE_LISTING$URL_SEARCHTERM_NONE$URL_CATEGORY_MINIPC_ALLINONE&_from=R40&_fsrp=1$URL_LOCATION_NORTHAMERICA&imm=1&_sop=10$URL_PER_PAGE_240"
    echo "{\"url\": \"$URL_MINIPC\"}" > "$DIR_MINIPC/url.json"
fi
wget -O "$DIR_MINIPC/$(date +%s).html" "$(jq -r '.url' "$DIR_MINIPC/url.json")"
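The script can also be run by hand. A sketch, assuming wget and jq are installed; the exact filename below is illustrative:

  $ XDG_DATA_HOME="$HOME/.local/share" ./systemd/scraper_fetch.sh
  $ ls "$HOME/.local/share/ebay_scraper/raw_scraped/ssd"
  1750637297.html  url.json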
10  systemd/scraper_fetch.timer  Normal file
@@ -0,0 +1,10 @@
[Unit]
Description=Run a fetch of a website

[Timer]
OnBootSec=15min
OnUnitActiveSec=1h
Persistent=true

[Install]
WantedBy=timers.target
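One way to wire the pair up on a systemd host, assuming system-wide units (ExecStart in the service above expects the script at /usr/local/bin/scraper_fetch.sh):

  $ sudo install -m 755 systemd/scraper_fetch.sh /usr/local/bin/
  $ sudo cp systemd/scraper_fetch.service systemd/scraper_fetch.timer /etc/systemd/system/
  $ sudo systemctl daemon-reload
  $ sudo systemctl enable --now scraper_fetch.timer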
59  test_data/ebay_scraper/raw_scraped/minipc/1750637335.html  Normal file
@@ -0,0 +1,59 @@
--2025-06-22 20:08:55--  https://www.ebay.com/sch/i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’

[wget progress table, 0K through 2350K, elided]

2025-06-22 20:08:56 (3.20 MB/s) - ‘i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’ saved [2412662]
1  test_data/ebay_scraper/raw_scraped/minipc/url.json  Normal file
@@ -0,0 +1 @@
{"url": "https://www.ebay.com/sch/i.html?&_nkw=&_sacat=179&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240"}
8462  test_data/ebay_scraper/raw_scraped/ssd/1750369463.html  Normal file
File diff suppressed because one or more lines are too long
63  test_data/ebay_scraper/raw_scraped/ssd/1750637297.html  Normal file
@@ -0,0 +1,63 @@
--2025-06-22 20:08:17--  https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’

[wget progress table, 0K through 2550K, elided]

2025-06-22 20:08:18 (3.60 MB/s) - ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240’ saved [2611588]
63  test_data/ebay_scraper/raw_scraped/ssd/1750637334.html  Normal file
@@ -0,0 +1,63 @@
--2025-06-22 20:08:54--  https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving www.ebay.com (www.ebay.com)... 23.56.163.160
Connecting to www.ebay.com (www.ebay.com)|23.56.163.160|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240.2’

[wget progress table, 0K through 2550K, elided]

2025-06-22 20:08:55 (3.38 MB/s) - ‘i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240.2’ saved [2614240]
1  test_data/ebay_scraper/raw_scraped/ssd/url.json  Normal file
@@ -0,0 +1 @@
{"url": "https://www.ebay.com/sch/i.html?&_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10&_ipg=240"}