// ebay_scraper_rust/src/main.rs

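//! HTTP service for browsing and parsing scraped eBay listing data stored in
//! a local SQLite database.
//!
//! A quick smoke test, assuming the server is running on its default bind
//! address: `curl http://127.0.0.1:8080/listing/since/0/10`
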
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use clap::Parser;
use ebay_scraper_rust::db::{Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::info;

mod xdg_dirs;

#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for homelab purposes"
)]
struct Args {}
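
/// Returns the parsed page recorded at the given UNIX `timestamp`, if any.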
#[get("/page/{timestamp}")]
async fn page_get(
    db: Data<Mutex<rusqlite::Connection>>,
    timestamp: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedPage::lookup_db(
        &db.lock().unwrap(),
        chrono::DateTime::from_timestamp(*timestamp, 0).unwrap(),
    )))
}
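
/// Returns a single listing looked up by its eBay item id.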
#[get("/listing/{id}")]
async fn listing_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
}
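
/// Returns listings seen since the given UNIX `timestamp`, up to `limit` of them.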
#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
    db: Data<Mutex<rusqlite::Connection>>,
    req: web::Path<(i64, i64)>,
) -> Result<impl Responder> {
    let (timestamp, limit) = req.into_inner();
    Ok(web::Json(Listing::lookup_since(
        &db.lock().unwrap(),
        timestamp,
        limit,
    )))
}
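
/// Runs the size/quantity parser over every listing that has not been parsed
/// yet and returns how many listings were processed.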
#[post("listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
    let mut cnt = 0;
    let db_unlocked = db.lock().unwrap();
    Listing::lookup_non_parsed(&db_unlocked)
        .iter()
        .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
        .inspect(|_| cnt += 1)
        .for_each(|ps| ps.add_or_update_db(&db_unlocked));
    Ok(web::Json(cnt))
}
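
/// Returns the parsed storage information (size and quantity) for one listing.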
#[get("listing/parse/{id}")]
async fn listing_parse_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedStorage::lookup_db(
        &db.lock().unwrap(),
        *id,
    )))
}
pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
    if !std::fs::exists(path).expect("Unable to check if directory exists") {
        panic!(
            "Directory {:?} does not exist, cannot grab timestamps from there.",
            path
        );
    }
    std::fs::read_dir(path)
        .unwrap()
        .inspect(|fpath| info!("Found {:?}", fpath))
        .map(|fpath| fpath.unwrap().path())
        .filter_map(|fstem| {
            fstem
                .file_stem()
                .and_then(|s| s.to_str())
                .expect("Invalid file name")
                .parse()
                .ok()
        })
        .collect()
}
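
/// Ingests every downloaded search page for `category` that the database has
/// not seen yet, upserts the listings found on those pages, and returns how
/// many listings were inserted or updated.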
#[post("page/parse/{category}")]
async fn parse_post(
    db: Data<Mutex<rusqlite::Connection>>,
    downloaddir: Data<PathBuf>,
    category: web::Path<String>,
) -> Result<impl Responder> {
    let dir = &downloaddir.join(category.clone());
    // Ensure the category exists in the DB by registering its search URL,
    // which is read from the category's url.json.
    let url: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
    info!("{:?}", url);
    let su = SearchURL {
        full_url: url.to_string(),
        name: category.to_string(),
    };
    info!("{:?}", su);
    su.add_or_update(&db.lock().unwrap());
    let added: u64 = timestamps_from_dir(dir)
        .iter()
        .filter(|t| {
            info!("Checking for the existence of page {t}");
            let p = ParsedPage::lookup_db(
                &db.lock().unwrap(),
                chrono::DateTime::from_timestamp(**t, 0).unwrap(),
            );
            // Timestamp never seen before, let's pass it on.
            if p.is_none() {
                return true;
            }
            // Timestamp was seen before; only pass it on if it came from a
            // different category.
            p.unwrap().category != *category
        })
        .map(|t| {
            let timestamp = chrono::DateTime::from_timestamp(*t, 0).unwrap();
            info!("Adding or updating db with timestamp:{timestamp} category:{category}");
            ParsedPage {
                timestamp,
                category: category.to_string(),
            }
            .add_or_update_db(&db.lock().unwrap());
            let mut cnt = 0;
            parser_ebay::extract_data_from_html(
                &std::fs::read_to_string(dir.join(format!("{t}.html"))).unwrap(),
                &timestamp,
            )
            .unwrap()
            .iter()
            .for_each(|l| {
                cnt += 1;
                l.add_or_update(&db.lock().unwrap());
                info!("Inserting id:{}, title:{}", l.item_id, l.title);
            });
            cnt
        })
        .sum();
    Ok(added.to_string())
}
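
// Wire up the HTTP server: one shared SQLite connection behind a mutex and
// the scrape-data directory are handed to every handler as app data.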
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
    let _ = Args::parse();
    let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
    info!(
        "Starting with scraped data dir of \"{}\".",
        scrapedatadir.to_str().unwrap()
    );
    let db_mutex = Data::new(Mutex::new(get_initialized(None)));
    HttpServer::new(move || {
        App::new()
            .service(page_get)
            .service(listing_get)
            .service(listing_since_get)
            .service(parse_post)
            .service(parse_listings)
            .app_data(db_mutex.clone())
            .app_data(Data::new(scrapedatadir.clone()))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}