use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use chrono::DateTime;
use clap::Parser;
use ebay_scraper_rust::db::{
    ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::{info, instrument};

use tracing_subscriber::filter::EnvFilter;
use tracing_subscriber::fmt;
use tracing_subscriber::prelude::*;

mod xdg_dirs;

#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for homelab purposes"
)]
struct Args {}

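/// GET the stored metadata for a scraped search page, addressed by the unix
/// timestamp it was downloaded at. Note the `unwrap()`: a timestamp outside
/// chrono's supported range panics instead of returning an error.
///
/// Hypothetical invocation, assuming the server below is running:
///   curl http://127.0.0.1:8080/page/1700000000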
#[get("/page/{timestamp}")]
async fn page_get(
    db: Data<Mutex<rusqlite::Connection>>,
    timestamp: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedPage::lookup(
        &db.lock().unwrap(),
        DateTime::from_timestamp(*timestamp, 0).unwrap(),
    )))
}

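/// GET a listing's bid history as `(timestamp, current_bid_usd_cents)`
/// pairs.
///
/// Hypothetical invocation (the item id is made up):
///   curl http://127.0.0.1:8080/listing/123456789/history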
#[get("/listing/{id}/history")]
async fn listing_history_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    // The `?` inside `filter_map` drops appearances that never recorded a bid.
    let history: Vec<_> = ItemAppearances::lookup(&db.lock().unwrap(), *id)
        .iter()
        .inspect(|e| info!("got: {:?}", e))
        .filter_map(|e| Some((e.timestamp, e.current_bid_usd_cents?)))
        .collect();
    Ok(web::Json(history))
}

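/// GET a single listing by its eBay item id.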
#[get("/listing/{id}")]
async fn listing_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(Listing::lookup(&db.lock().unwrap(), *id)))
}

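/// GET up to `limit` listings recorded since the given unix timestamp.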
#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
    db: Data<Mutex<rusqlite::Connection>>,
    req: web::Path<(i64, i64)>,
) -> Result<impl Responder> {
    let (timestamp, limit) = req.into_inner();
    Ok(web::Json(Listing::lookup_since(
        &db.lock().unwrap(),
        DateTime::from_timestamp(timestamp, 0).unwrap(),
        limit,
    )))
}

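/// POST: run the storage parser over every listing that has not been parsed
/// yet, persist the results, and respond with how many were processed.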
#[post("/listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
    let mut cnt = 0;
    let db_unlocked = db.lock().unwrap();
    Listing::lookup_non_parsed(&db_unlocked)
        .iter()
        .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
        .inspect(|_| cnt += 1)
        .for_each(|ps| ps.add_or_update(&db_unlocked));

    Ok(web::Json(cnt))
}

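/// GET the parsed storage size/quantity information for one listing.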
#[get("/listing/parse/{id}")]
async fn listing_parse_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedStorage::lookup(&db.lock().unwrap(), *id)))
}

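/// Collect the unix timestamps encoded in a directory's file stems, e.g. a
/// hypothetical `1700000000.html` yields `1700000000`. Stems that don't
/// parse as `i64` are skipped; panics if the directory is missing entirely.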
pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
    if !std::fs::exists(path).expect("Unable to check if directory exists") {
        panic!(
            "Directory {:?} does not exist, cannot grab timestamps from there.",
            path
        );
    }

    std::fs::read_dir(path)
        .unwrap()
        .map(|fpath| fpath.unwrap().path())
        .filter_map(|fstem| {
            fstem
                .file_stem()
                .and_then(|s| s.to_str())
                .expect("Invalid file name")
                .parse()
                .ok()
        })
        .collect()
}

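/// POST: ingest every downloaded page for `category` that isn't in the
/// database yet. Reads `url.json` and the `<timestamp>.html` files from the
/// category's download directory, extracts the listings, and stores them.
/// Responds with the number of listings added.
///
/// Hypothetical invocation, assuming a category named "ssd" has been
/// downloaded:
///   curl -X POST http://127.0.0.1:8080/page/parse/ssd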
#[post("/page/parse/{category}")]
#[instrument(skip_all)]
async fn parse_post(
    db: Data<Mutex<rusqlite::Connection>>,
    downloaddir: Data<PathBuf>,
    category: web::Path<String>,
) -> Result<impl Responder> {
    let dir = &downloaddir.join(category.as_str());

    // Ensure the category is created.
    let url: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
    let su = SearchURL {
        full_url: url.to_string(),
        name: category.to_string(),
    };
    su.add_or_update(&db.lock().unwrap());

    // Find all pages.
    let pages = timestamps_from_dir(dir);

    // See which pages haven't been seen before.
    let to_parse = pages.iter().filter(|t| {
        let ts = DateTime::from_timestamp(**t, 0).unwrap();
        info!("Checking if page with a timestamp of {ts} and category of {category} exists");
        let p = ParsedPage::lookup(&db.lock().unwrap(), ts);

        // Timestamp never seen before, let's pass it on.
        if p.is_none() {
            info!(
                "Page of timestamp:{} and category:{category} never seen before, processing ...",
                ts.timestamp()
            );
            return true;
        }

        // Timestamp was seen before *and* from the same category, don't pass
        // it on.
        if p.unwrap().category == *category {
            info!(
                "Page of timestamp:{} and category:{category} seen before, skipping ...",
                ts.timestamp()
            );
            return false;
        }

        info!(
            "Page of timestamp:{} seen before, but not of category:{category}, processing ...",
            ts.timestamp()
        );
        true
    });

    let mut added_count = 0;
    for p in to_parse {
        let ts = DateTime::from_timestamp(*p, 0).unwrap();
        ParsedPage {
            timestamp: ts,
            category: category.to_string(),
        }
        .add_or_update(&db.lock().unwrap());

        // Pages are stored on disk by their unix timestamp, so build the file
        // name from the raw number, not `ts`'s human-readable `Display` form.
        let elements = parser_ebay::extract_data_from_html(
            &std::fs::read_to_string(dir.join(format!("{p}.html"))).unwrap(),
            &ts,
            &category,
        )
        .unwrap();

        added_count += elements.len();
        for e in elements {
            e.0.add_or_update(&db.lock().unwrap());
            e.1.add_or_update(&db.lock().unwrap());
            info!("Inserting id:{}, title:{}", e.0.item_id, e.0.title);
        }
    }

    info!("Added {added_count} listings");
    Ok(added_count.to_string())
}

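/// Set up tracing, parse CLI arguments, locate the scraped-data directory,
/// open the SQLite database, and serve all handlers on 127.0.0.1:8080.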
#[actix_web::main]
async fn main() -> std::io::Result<()> {
    tracing_subscriber::registry()
        .with(fmt::layer())
        .with(EnvFilter::from_default_env())
        .init();
    // Parsing is only for `--help`/`--version`; there are no flags yet.
    let _ = Args::parse();

    let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
    info!(
        "Starting with scraped data dir of \"{}\".",
        scrapedatadir.to_str().unwrap()
    );
    let db_mutex = Data::new(Mutex::new(get_initialized(None)));

    HttpServer::new(move || {
        App::new()
            .service(page_get)
            .service(listing_get)
            .service(listing_history_get)
            .service(listing_since_get)
            .service(listing_parse_get)
            .service(parse_post)
            .service(parse_listings)
            .app_data(db_mutex.clone())
            .app_data(Data::new(scrapedatadir.clone()))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}