// ebay_scraper_rust/src/main.rs

use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use chrono::DateTime;
use clap::Parser;
use ebay_scraper_rust::db::{
    ItemAppearances, Listing, ParsedPage, ParsedStorage, SearchURL, get_initialized,
};
use ebay_scraper_rust::{parser_ebay, parser_storage};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::info;

mod xdg_dirs;

// Command-line arguments of the scraper server, parsed with clap's derive API.
//
// No fields are declared, so there are no application-specific options yet;
// the attribute below only supplies the program name, version and description.
//
// Plain `//` comments are used here on purpose: clap's derive turns `///` doc
// comments on the struct into help text.
#[derive(Parser, Debug)]
#[clap(
    name = "ebay-scraper-rust",
    version = "0.1.0",
    about = "Scrapes eBay search results for homelab purposes"
)]
struct Args {}

#[get("/page/{timestamp}")]
async fn page_get(
    db: Data<Mutex<rusqlite::Connection>>,
    timestamp: web::Path<i64>,
) -> Result<impl Responder> {
    Ok(web::Json(ParsedPage::lookup(
        &db.lock().unwrap(),
        chrono::DateTime::from_timestamp(*timestamp, 0).unwrap(),
    )))
}

/// `GET /listing/{id}/history`: responds with a JSON array of
/// `(timestamp, current_bid_usd_cents)` pairs built from the
/// [`ItemAppearances::lookup`] results for `id`.
///
/// Every appearance is logged; appearances without a current bid are left
/// out of the response.
///
/// # Panics
///
/// Panics if the database mutex is poisoned.
#[get("/listing/{id}/history")]
async fn listing_history_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    let conn = db.lock().unwrap();
    let appearances = ItemAppearances::lookup(&conn, *id);

    let mut history = Vec::new();
    for appearance in appearances.iter() {
        info!("got: {:?}", appearance);
        if let Some(bid) = appearance.current_bid_usd_cents {
            history.push((appearance.timestamp, bid));
        }
    }

    Ok(web::Json(history))
}

/// `GET /listing/{id}`: responds with the JSON-serialised result of
/// [`Listing::lookup`] for `id`.
///
/// # Panics
///
/// Panics if the database mutex is poisoned.
#[get("/listing/{id}")]
async fn listing_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    let wanted = id.into_inner();
    let conn = db.lock().unwrap();
    let listing = Listing::lookup(&conn, wanted);
    Ok(web::Json(listing))
}

#[get("/listing/since/{timestamp}/{limit}")]
async fn listing_since_get(
    db: Data<Mutex<rusqlite::Connection>>,
    req: web::Path<(i64, i64)>,
) -> Result<impl Responder> {
    Ok(web::Json(Listing::lookup_since(
        &db.lock().unwrap(),
        DateTime::from_timestamp(req.0, 0).unwrap(),
        req.1,
    )))
}

/// `POST listing/parse`: runs [`parser_storage::parse_size_and_quantity`] on
/// every entry returned by [`Listing::lookup_non_parsed`] and stores each
/// result with `add_or_update`.
///
/// The response is the number of processed entries, as JSON. The database
/// lock is held for the whole batch.
///
/// # Panics
///
/// Panics if the database mutex is poisoned.
#[post("listing/parse")]
async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Responder> {
    let conn = db.lock().unwrap();
    let pending = Listing::lookup_non_parsed(&conn);

    let mut processed = 0;
    for entry in pending.iter() {
        let parsed = parser_storage::parse_size_and_quantity(entry.0, &entry.1);
        processed += 1;
        parsed.add_or_update(&conn);
    }

    Ok(web::Json(processed))
}

/// `GET listing/parse/{id}`: responds with the JSON-serialised result of
/// [`ParsedStorage::lookup`] for `id`.
///
/// # Panics
///
/// Panics if the database mutex is poisoned.
#[get("listing/parse/{id}")]
async fn listing_parse_get(
    db: Data<Mutex<rusqlite::Connection>>,
    id: web::Path<i64>,
) -> Result<impl Responder> {
    let wanted = id.into_inner();
    let conn = db.lock().unwrap();
    let parsed = ParsedStorage::lookup(&conn, wanted);
    Ok(web::Json(parsed))
}

/// Collects the integer timestamps encoded in the names of the entries
/// directly inside `path`.
///
/// Every entry whose file stem (the name without its last extension) parses
/// as an `i64` contributes that value. Entries with any other name, whether
/// non-numeric or not valid UTF-8, are skipped. The order of the returned
/// values follows the directory iteration order.
///
/// # Panics
///
/// Panics if `path` does not exist, if its existence cannot be determined,
/// or if the directory or one of its entries cannot be read.
pub fn timestamps_from_dir(path: &Path) -> Vec<i64> {
    // `fs::exists` returns `Err` when the check itself fails (for example on
    // a permission error), which is different from "does not exist".
    let exists = std::fs::exists(path)
        .unwrap_or_else(|e| panic!("Unable to check whether {:?} exists: {e}", path));
    if !exists {
        panic!(
            "Directory {:?} does not exist, cannot grab timestamps from there.",
            path
        );
    }

    std::fs::read_dir(path)
        .unwrap_or_else(|e| panic!("Unable to read directory {:?}: {e}", path))
        .inspect(|entry| info!("Found {:?}", entry))
        .map(|entry| {
            entry
                .unwrap_or_else(|e| panic!("Unable to read an entry of {:?}: {e}", path))
                .path()
        })
        // A name that is not valid UTF-8 cannot be a decimal timestamp, so it
        // is skipped exactly like any other non-numeric name.
        .filter_map(|file| file.file_stem()?.to_str()?.parse().ok())
        .collect()
}

#[post("page/parse/{category}")]
async fn parse_post(
    db: Data<Mutex<rusqlite::Connection>>,
    downloaddir: Data<PathBuf>,
    category: web::Path<String>,
) -> Result<impl Responder> {
    let dir = &downloaddir.join(category.clone());

    // Ensure the category is created.
    let url: serde_json::Value =
        serde_json::from_str(&std::fs::read_to_string(dir.join("url.json")).unwrap()).unwrap();
    let su = SearchURL {
        full_url: url.to_string(),
        name: category.to_string(),
    };
    su.add_or_update(&db.lock().unwrap());

    // Find all pages.
    let pages = timestamps_from_dir(dir);

    // See what pages haven't been seen before.
    let to_parse = pages.iter().filter(|t| {
        let ts = chrono::DateTime::from_timestamp(**t, 0).unwrap();
        info!("Checking if page with a timestamp of {ts} and catagory of {category} exists");
        let p = ParsedPage::lookup(&db.lock().unwrap(), ts);

        // Timestamp never seen before, lets pass it on.
        if p.is_none() {
            return true;
        }

        // Timestamp was seen before *and* from the same catagory, don't pass
        // it on.
        if p.unwrap().category == *category {
            return false;
        }
        return true;
    });

    let mut added_count = 0;
    for p in to_parse {
        let ts = chrono::DateTime::from_timestamp(*p, 0).unwrap();
        info!("Adding page with a timestamp of {ts} and catagory of {category} to db");
        ParsedPage {
            timestamp: ts,
            category: category.to_string(),
        }
        .add_or_update(&db.lock().unwrap());

        let elements = parser_ebay::extract_data_from_html(
            &std::fs::read_to_string(dir.join(format!("{ts}.html"))).unwrap(),
            &ts,
            &category,
        )
        .unwrap();

        added_count += elements.len();
        for e in elements {
            e.0.add_or_update(&db.lock().unwrap());
            e.1.add_or_update(&db.lock().unwrap());
            info!("Inserting id:{}, title:{}", e.0.item_id, e.0.title);
        }
    }

    Ok(added_count.to_string())
}

#[actix_web::main]
async fn main() -> std::io::Result<()> {
    env_logger::init_from_env(env_logger::Env::new().default_filter_or("info"));
    let _ = Args::parse();

    let scrapedatadir = xdg_dirs::ensure_scrapedata_dir_exists("ebay_scraper", None);
    info!(
        "Starting with scraped data dir of \"{}\".",
        scrapedatadir.to_str().unwrap()
    );
    let db_mutex = Data::new(Mutex::new(get_initialized(None)));

    HttpServer::new(move || {
        App::new()
            .service(page_get)
            .service(listing_get)
            .service(listing_history_get)
            .service(listing_since_get)
            .service(parse_post)
            .service(parse_listings)
            .app_data(db_mutex.clone())
            .app_data(Data::new(scrapedatadir.clone()))
    })
    .bind(("127.0.0.1", 8080))?
    .run()
    .await
}