diff --git a/Cargo.lock b/Cargo.lock
index 808455b..0933f7c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -695,7 +695,9 @@ dependencies = [
  "chrono",
  "clap",
  "dirs",
+ "futures",
  "lazy_static",
+ "num_enum",
  "rayon",
  "regex",
  "reqwest",
@@ -848,6 +850,21 @@ dependencies = [
  "new_debug_unreachable",
 ]
 
+[[package]]
+name = "futures"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
 [[package]]
 name = "futures-channel"
 version = "0.3.31"
@@ -864,12 +881,34 @@ version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
 
+[[package]]
+name = "futures-executor"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
 [[package]]
 name = "futures-io"
 version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
 
+[[package]]
+name = "futures-macro"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "futures-sink"
 version = "0.3.31"
@@ -888,8 +927,10 @@ version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
 dependencies = [
+ "futures-channel",
  "futures-core",
  "futures-io",
+ "futures-macro",
  "futures-sink",
  "futures-task",
  "memchr",
@@ -1566,6 +1607,28 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "num_enum"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a"
+dependencies = [
+ "num_enum_derive",
+ "rustversion",
+]
+
+[[package]]
+name = "num_enum_derive"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d"
+dependencies = [
+ "proc-macro-crate",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "object"
 version = "0.36.7"
@@ -1772,6 +1835,15 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
 
+[[package]]
+name = "proc-macro-crate"
+version = "3.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
+dependencies = [
+ "toml_edit",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.95"
@@ -2534,6 +2606,23 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+
+[[package]]
+name = "toml_edit"
+version = "0.22.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
"41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap", + "toml_datetime", + "winnow", +] + [[package]] name = "tower" version = "0.5.2" @@ -3009,6 +3098,15 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winnow" +version = "0.7.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf" +dependencies = [ + "memchr", +] + [[package]] name = "wit-bindgen-rt" version = "0.39.0" diff --git a/Cargo.toml b/Cargo.toml index 335c736..1906a1d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,9 @@ actix-web = "4.11.0" chrono = { version = "0.4.41", features = ["serde"] } clap = { version = "4.5.40", features = ["derive"] } dirs = "6.0.0" +futures = "0.3.31" lazy_static = "1.5.0" +num_enum = "0.7.4" rayon = "1.10.0" regex = "1.11.1" reqwest = { version = "0.12.23", features = ["blocking"] } diff --git a/src/db.rs b/src/db.rs index e28f6f4..c54ba65 100644 --- a/src/db.rs +++ b/src/db.rs @@ -1,5 +1,8 @@ use chrono::{DateTime, Utc}; +use num_enum::TryFromPrimitive; use rusqlite::Connection; +use rusqlite::ToSql; +use rusqlite::types::FromSql; use serde::Deserialize; use serde::Serialize; use std::path::Path; @@ -205,15 +208,34 @@ impl ParsedPage { } } -#[derive(Serialize, Debug, PartialEq, Copy, Clone)] +#[repr(i64)] +#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)] +pub enum StorageParsingEngineVersion { + Testing = 0, + Regex = 1, + LLM = 2, +} +impl ToSql for StorageParsingEngineVersion { + fn to_sql(&self) -> rusqlite::Result> { + Ok((*self as i64).into()) + } +} +impl FromSql for StorageParsingEngineVersion { + fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult { + let v = value.as_i64()?; + Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v)) + } +} + +#[derive(Serialize, Debug, PartialEq, Clone)] pub struct ParsedStorage { pub id: i64, pub item: i64, pub total_gigabytes: i64, pub quantity: i64, pub individual_size_gigabytes: i64, - pub parse_engine: i64, - pub needed_description_check: bool, + pub parse_engine: StorageParsingEngineVersion, + pub failed_reason: String, } impl DBTable for ParsedStorage { const TABLE_NAME: &'static str = "Storage_Parsed"; @@ -224,13 +246,13 @@ impl DBTable for ParsedStorage { quantity INTEGER, sizes_gigabytes TEXT, parse_engine INTEGER, - need_description_check INTEGER, + failed_reason TEXT, UNIQUE(item, parse_engine) FOREIGN KEY(item) REFERENCES Listings(item_id) "; fn get_all(conn: &Connection) -> rusqlite::Result> { - let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?; + let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?; let iter = stmt.query_map([], |row| { Ok(ParsedStorage { id: row.get(0)?, @@ -242,7 +264,7 @@ impl DBTable for ParsedStorage { r.parse().unwrap_or(0) }, parse_engine: row.get(5)?, - needed_description_check: row.get(6)?, + failed_reason: row.get(6)?, }) })?; @@ -273,7 +295,7 @@ impl ParsedStorage { r.parse().unwrap() }, parse_engine: row.get(5)?, - needed_description_check: row.get(6)?, + failed_reason: row.get(6)?, }) }) .ok() @@ -283,21 +305,26 @@ impl 
+#[repr(i64)]
+#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)]
+pub enum StorageParsingEngineVersion {
+    Testing = 0,
+    Regex = 1,
+    LLM = 2,
+}
+impl ToSql for StorageParsingEngineVersion {
+    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
+        Ok((*self as i64).into())
+    }
+}
+impl FromSql for StorageParsingEngineVersion {
+    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
+        let v = value.as_i64()?;
+        Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
+    }
+}
+
+#[derive(Serialize, Debug, PartialEq, Clone)]
 pub struct ParsedStorage {
     pub id: i64,
     pub item: i64,
     pub total_gigabytes: i64,
     pub quantity: i64,
     pub individual_size_gigabytes: i64,
-    pub parse_engine: i64,
-    pub needed_description_check: bool,
+    pub parse_engine: StorageParsingEngineVersion,
+    pub failed_reason: String,
 }
 impl DBTable for ParsedStorage {
     const TABLE_NAME: &'static str = "Storage_Parsed";
@@ -224,13 +246,13 @@
         quantity INTEGER,
         sizes_gigabytes TEXT,
         parse_engine INTEGER,
-        need_description_check INTEGER,
+        failed_reason TEXT,
         UNIQUE(item, parse_engine)
         FOREIGN KEY(item) REFERENCES Listings(item_id)
     ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
-        let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
+        let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?;
         let iter = stmt.query_map([], |row| {
             Ok(ParsedStorage {
                 id: row.get(0)?,
@@ -242,7 +264,7 @@
                     r.parse().unwrap_or(0)
                 },
                 parse_engine: row.get(5)?,
-                needed_description_check: row.get(6)?,
+                failed_reason: row.get(6)?,
             })
         })?;
 
@@ -273,7 +295,7 @@
                     r.parse().unwrap()
                 },
                 parse_engine: row.get(5)?,
-                needed_description_check: row.get(6)?,
+                failed_reason: row.get(6)?,
             })
         })
        .ok()
@@ -283,21 +305,26 @@ impl ParsedStorage {
     }
 
     pub fn add_or_update(&self, conn: &Connection) {
-        let _ = conn.execute(&format!("
+        let _ = conn
+            .execute(
+                &format!(
+                    "
             INSERT OR REPLACE INTO {}
-            (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
+            (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason)
             VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
-            Self::TABLE_NAME),
-            (
-                &self.item,
-                self.total_gigabytes,
-                self.quantity,
-                self.individual_size_gigabytes.to_string(),
-                self.parse_engine,
-                self.needed_description_check
+                    Self::TABLE_NAME
+                ),
+                (
+                    &self.item,
+                    self.total_gigabytes,
+                    self.quantity,
+                    self.individual_size_gigabytes.to_string(),
+                    self.parse_engine,
+                    &self.failed_reason,
+                ),
             )
-        ).unwrap();
+            .unwrap();
     }
 }
 
@@ -494,19 +521,40 @@ impl Listing {
             .collect()
     }
 
-    pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
-        let mut stmt = conn
-            .prepare(&format!(
-                "
-            SELECT ei.item_id, ei.title FROM {} AS ei
-            LEFT JOIN {} AS sp ON ei.item_id = sp.item
-            WHERE sp.item IS NULL",
-                Self::TABLE_NAME,
-                ParsedStorage::TABLE_NAME
-            ))
+    pub fn lookup_pending_parse(
+        conn: &Connection,
+        allowed_engines: &[i64],
+        count_limit: u64,
+    ) -> Vec<(i64, String)> {
+        let engines_filter = if !allowed_engines.is_empty() {
+            format!(
+                "AND ({})",
+                allowed_engines
+                    .iter()
+                    .map(|e| "ps.parse_engine = ".to_owned() + &e.to_string())
+                    .collect::<Vec<_>>()
+                    .join(" OR ")
+            )
+        } else {
+            String::new()
+        };
+
+        let query = format!(
+            "
+            SELECT listing.item_id, listing.title FROM {0} AS listing
+            WHERE NOT EXISTS (
+                SELECT 1 FROM {1} AS ps
+                WHERE listing.item_id = ps.item {engines_filter}
+            )
+            LIMIT {count_limit}
+            ",
+            Self::TABLE_NAME,
+            ParsedStorage::TABLE_NAME
+        );
+        conn.prepare(&query)
             .ok()
-            .unwrap();
-        stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
+            .unwrap()
+            .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
             .ok()
             .unwrap()
             .map(|e| e.unwrap())
@@ -565,7 +613,7 @@ pub fn listings_get_filtered(
             history: ItemAppearances::lookup(conn, l.item_id),
             parsed: ParsedStorage::lookup(conn, l.item_id),
         })
-        .filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
+        .filter(|lr| lr.parsed.iter().any(|p| p.failed_reason.is_empty()))
         .collect::<Vec<_>>();
     info!(
        "Found total {} listings since (str:{} epoch:{})",
@@ -614,6 +662,24 @@
     listings
 }
 
+#[repr(i64)]
+#[derive(Serialize, Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
+pub enum StorageLLMVersion {
+    Testing = 0,
+    Gemini2d5Prompt0 = 1,
+}
+impl ToSql for StorageLLMVersion {
+    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
+        Ok((*self as i64).into())
+    }
+}
+impl FromSql for StorageLLMVersion {
+    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
+        let v = value.as_i64()?;
+        Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
+    }
+}
+
 // This is mostly meant as a way to cache all of these.
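+// The llm_id column records which model/prompt revision produced a cached
+// row, so results from an older prompt can be told apart from newer ones.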
 #[derive(Serialize, Debug, PartialEq, Clone)]
 pub struct ParsedLLMStorageResult {
@@ -623,6 +689,7 @@
     pub quantity: i64,
     pub gigabytes: i64,
     pub fail_reason: String,
+    pub llm_id: StorageLLMVersion,
 }
 impl DBTable for ParsedLLMStorageResult {
     const TABLE_NAME: &'static str = "ParsedLLMStorageResult";
@@ -632,12 +699,13 @@
         title TEXT NOT NULL,
         quantity INTEGER NOT NULL,
         gigabytes INTEGER NOT NULL,
-        fail_reason TEXT NOT NULL
+        fail_reason TEXT NOT NULL,
+        llm_id INTEGER NOT NULL
     ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
         let mut stmt = conn.prepare(&format!(
-            "SELECT id, item_id, title, quantity, gigabytes, fail_reason FROM {}",
+            "SELECT id, item_id, title, quantity, gigabytes, fail_reason, llm_id FROM {}",
             Self::TABLE_NAME
         ))?;
         let iter = stmt.query_map([], |row| {
@@ -648,6 +716,7 @@
                 quantity: row.get(3)?,
                 gigabytes: row.get(4)?,
                 fail_reason: row.get(5)?,
+                llm_id: row.get(6)?,
             })
         })?;
 
@@ -674,6 +743,7 @@ impl ParsedLLMStorageResult {
                 quantity: row.get(3)?,
                 gigabytes: row.get(4)?,
                 fail_reason: row.get(5)?,
+                llm_id: row.get(6)?,
             })
         })
         .ok()
@@ -689,9 +759,10 @@ impl ParsedLLMStorageResult {
                 title,
                 quantity,
                 gigabytes,
-                fail_reason
+                fail_reason,
+                llm_id
             )
-            VALUES (?1, ?2, ?3, ?4, ?5)",
+            VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
                 Self::TABLE_NAME
             ),
             (
@@ -700,6 +771,7 @@
                 self.quantity,
                 self.gigabytes,
                 self.fail_reason.clone(),
+                self.llm_id,
             ),
         )
         .unwrap();
@@ -733,7 +805,7 @@ pub struct Stats {
     rows_parsed_storage: i64,
     rows_parsed_page: i64,
     rows_item_appearances: i64,
-    // pub rows_parsed_storage_llm: i64,
+    pub rows_parsed_storage_llm: i64,
 }
 
 pub fn get_stats(conn: &Connection) -> Stats {
@@ -743,7 +815,7 @@
         rows_parsed_storage: ParsedStorage::get_count(conn),
         rows_parsed_page: ParsedPage::get_count(conn),
         rows_item_appearances: ItemAppearances::get_count(conn),
-        // rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
+        rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
     }
 }
 
@@ -765,7 +837,7 @@ mod tests {
     let listing = Listing {
         id: 1,
         item_id: 1234,
-        title: "Some Title".to_string(),
+        title: "Lot of 2 512GB SSD 6gb/s working with 5% wear".to_string(),
         buy_it_now_price_cents: Some(123),
         has_best_offer: false,
         image_url: "google.com".to_string(),
@@ -779,8 +851,8 @@
         total_gigabytes: 13,
         quantity: 3,
         individual_size_gigabytes: 13,
-        parse_engine: 9,
-        needed_description_check: true,
+        parse_engine: StorageParsingEngineVersion::Testing,
+        failed_reason: "".to_owned(),
     };
     parsed.add_or_update(&db);
     assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
@@ -811,6 +883,7 @@
         item_id: 12345,
         quantity: 32,
         title: "Some Title".to_owned(),
+        llm_id: StorageLLMVersion::Testing,
     };
     parsedllmstorage.add_or_update(&db);
     assert_eq!(
diff --git a/src/main.rs b/src/main.rs
index 0c588b7..31deebd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,12 +1,11 @@
 use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
 use chrono::{DateTime, Utc};
 use clap::Parser;
-use ebay_scraper_rust::db::{
-    DBTable, ItemAppearances, Listing, ParsedLLMStorageResult, ParsedPage, ParsedStorage,
-    SearchURL, get_initialized, get_stats, listings_get_filtered,
-};
+use ebay_scraper_rust::db;
+use ebay_scraper_rust::db::DBTable;
 use ebay_scraper_rust::parser::parse_dir;
 use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
+use futures::future::join_all;
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::sync::Mutex;
@@ -32,6 +31,7 @@ struct AppCtx {
     db: rusqlite::Connection,
     db_llm: rusqlite::Connection,
     download_dir: PathBuf,
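+    // Handle to the background LLM parse task; spawned lazily by the first
+    // call to /listing/parse.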
+    llm_parser: Option<actix_web::rt::task::JoinHandle<()>>,
 }
 
 #[derive(Deserialize, Debug)]
@@ -47,7 +47,7 @@ async fn listings_filtered_get(
     filter: web::Query<ListingsFilter>,
 ) -> Result<impl Responder> {
     let start = Instant::now();
-    let res = listings_get_filtered(
+    let res = db::listings_get_filtered(
         &ctx.lock().unwrap().db,
         &DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
         filter.limit.unwrap_or(1_000),
@@ -64,12 +64,12 @@
 
 #[get("/listing/{id}")]
 async fn listing_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
-    Ok(web::Json(Listing::lookup(&ctx.lock().unwrap().db, *id)))
+    Ok(web::Json(db::Listing::lookup(&ctx.lock().unwrap().db, *id)))
 }
 
 #[get("/listing/{id}/parsed")]
 async fn listing_parse_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
-    Ok(web::Json(ParsedStorage::lookup(
+    Ok(web::Json(db::ParsedStorage::lookup(
         &ctx.lock().unwrap().db,
         *id,
     )))
@@ -86,7 +86,7 @@ async fn listing_history_get(
     ctx: Data<Mutex<AppCtx>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    let history: Vec<_> = ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
+    let history: Vec<_> = db::ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
         .iter()
         // .inspect(|e| info!("got: {:?}", e))
         .filter_map(|e| {
@@ -99,29 +99,85 @@
     Ok(web::Json(history))
 }
 
+async fn storage_parse_work(entries: &[(i64, String)]) -> Vec<db::ParsedStorage> {
+    let llm_futures: Vec<_> = entries
+        .iter()
+        .map(|(id, title)| parser_storage_e1::parse_size_and_quantity_llm(*id, title))
+        .collect();
+    let llm_future_results = join_all(llm_futures).await;
+    let llm_results = llm_future_results
+        .iter()
+        .flatten()
+        .map(|e| db::ParsedStorage {
+            id: 0,
+            item: e.item_id,
+            total_gigabytes: e.quantity * e.gigabytes,
+            quantity: e.quantity,
+            individual_size_gigabytes: e.gigabytes,
+            failed_reason: e.fail_reason.clone(),
+            parse_engine: db::StorageParsingEngineVersion::LLM,
+        });
+    // .inspect(|e| e.add_or_update(&unlocked.db))
+    // .map(|e| db::ParsedStorage {
+    //     id: 0,
+    //     item: e.item_id,
+    //     total_gigabytes: e.quantity * e.gigabytes,
+    //     quantity: e.quantity,
+    //     individual_size_gigabytes: e.gigabytes,
+    //     needed_description_check: !e.fail_reason.is_empty(),
+    //     parse_engine: db::StorageParsingEngineVersion::LLM,
+    // })
+    // .for_each(|e| e.add_or_update(&unlocked.db));
+
+    // And a regex based parse.
+    let regex_results = entries
+        .iter()
+        .map(|(id, title)| parser_storage_e0::parse_size_and_quantity(*id, &title));
+    // .for_each(|e| e.add_or_update(&unlocked.db));
+
+    regex_results.chain(llm_results).collect()
+}
+
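+// Background loop: once a second, pull a small batch of unparsed listings and
+// run both engines over them. Note the AppCtx mutex guard is held across the
+// awaits in storage_parse_work, so HTTP handlers block on the lock while a
+// batch is in flight.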
+fn storage_parse_worker(ctx: Data<Mutex<AppCtx>>) -> actix_web::rt::task::JoinHandle<()> {
+    actix_web::rt::spawn(async move {
+        loop {
+            actix_web::rt::time::sleep(std::time::Duration::from_millis(1000)).await;
+            let ctx_unlocked = ctx.lock().unwrap();
+            let entries = db::Listing::lookup_pending_parse(&ctx_unlocked.db, &[], 10);
+            let parsed = storage_parse_work(entries.as_slice()).await;
+            for p in parsed {
+                p.add_or_update(&ctx_unlocked.db);
+            }
+        }
+    })
+}
+
 #[post("/listing/parse")]
 async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
+    // Prepare a background parser to go through and use an LLM to parse the
+    // storage info.
+    if ctx.lock().unwrap().llm_parser.is_none() {
+        ctx.clone().lock().unwrap().llm_parser = Some(storage_parse_worker(ctx.clone()));
+    }
+
+    // Let's grab a few entries and then try parsing them with two engines.
     let ctx_locked = ctx.lock().unwrap();
-    let entries: Vec<_> = Listing::lookup_non_parsed(&ctx_locked.db)
+    let entries: Vec<_> = db::Listing::lookup_pending_parse(&ctx_locked.db, &[], 100)
         .iter()
         .take(10)
         .map(|e| e.clone())
         .collect();
     for (item_id, title) in &entries {
-        let ps0 = parser_storage_e0::parse_size_and_quantity(*item_id, &title);
-        ps0.add_or_update(&ctx_locked.db);
-
         let ps1 = parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await;
         if ps1.is_some() {
-            info!(
-                "Parsed using an LLM title:{} and results:{:?}",
-                title,
-                ps1.unwrap()
-            );
+            // info!(
+            //     "Parsed using an LLM title:{} and results:{:?}",
+            //     title,
+            //     ps1.unwrap()
+            // );
             ps1.unwrap().add_or_update(&ctx_locked.db);
-            ps1.unwrap().add_or_update(&ctx_locked.db_llm);
+            // ps1.unwrap().add_or_update(&ctx_locked.db_llm); No need
         } else {
             error!("Failed to parse {item_id} with title {title}");
         }
@@ -131,7 +187,7 @@
 
 #[get("/category")]
 async fn category_getnames(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
-    Ok(web::Json(SearchURL::names(&ctx.lock().unwrap().db)))
+    Ok(web::Json(db::SearchURL::names(&ctx.lock().unwrap().db)))
 }
 
 #[post("/category/{category}/parse")]
@@ -156,9 +212,9 @@ async fn category_parse(
 
 #[get("/stats")]
 async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
-    let stats_db = get_stats(&ctx.lock().unwrap().db);
-    // let stats_db_llm = get_stats(&ctx.lock().unwrap().db_llm);
-    // stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
+    let mut stats_db = db::get_stats(&ctx.lock().unwrap().db);
+    let stats_db_llm = db::get_stats(&ctx.lock().unwrap().db_llm);
+    stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
     Ok(web::Json(stats_db))
 }
 
@@ -166,11 +222,11 @@
 async fn admin_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
     let ctx_locked = ctx.lock().unwrap();
     let query_start_time = Instant::now();
-    let search_urls = SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
-    let parsed_pages = ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
-    let parsed_storages = ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
-    let item_appearances = ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
-    let listings = Listing::get_all(&ctx_locked.db).unwrap_or_default();
+    let search_urls = db::SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
+    let parsed_pages = db::ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
+    let parsed_storages = db::ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
+    let item_appearances = db::ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
+    let listings = db::Listing::get_all(&ctx_locked.db).unwrap_or_default();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
 
     let html_gen_start_time = Instant::now();
@@ -313,17 +369,19 @@ async fn main() -> std::io::Result<()> {
 
     let app_data = Data::new(Mutex::new(AppCtx {
         download_dir: scrapedatadir.clone(),
-        db: get_initialized(None),
+        db: db::get_initialized(None),
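+        // LLM parse results are cached in their own sqlite file, separate
+        // from the main scrape database.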
         db_llm: {
             let db_path = scrapedatadir.with_file_name("llm.sqlite");
-            let db = rusqlite::Connection::open(db_path).unwrap();
-            ParsedLLMStorageResult::initialize(&db);
+            let db = rusqlite::Connection::open(&db_path).unwrap();
+            db::ParsedLLMStorageResult::initialize(&db);
+            info!("Created {:?} for caching LLM-parsed titles.", db_path);
             db
         },
+        llm_parser: None,
     }));
 
     // Prepare our backend via pulling in what catagories we are preconfigured with.
-    SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
+    db::SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
 
     HttpServer::new(move || {
         App::new()
diff --git a/src/parser_storage_e0.rs b/src/parser_storage_e0.rs
index a41d01d..8abfe46 100644
--- a/src/parser_storage_e0.rs
+++ b/src/parser_storage_e0.rs
@@ -32,7 +32,7 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
     let upper_title = title.to_uppercase();
     let mut total_gb = 0i64;
     let mut quantity = 1i64;
-    let mut needed_description_check = false;
+    let mut failed_reason = String::new();
     let mut individual_size_gb = 0i64;
 
     for pattern in EXPLICIT_QTY_PATTERNS.iter() {
@@ -68,36 +68,35 @@
         if !unique_sizes_gb.is_empty() {
             individual_size_gb = unique_sizes_gb[0];
             if unique_sizes_gb.len() > 1 {
-                needed_description_check = true;
+                failed_reason = "Mixed Sizes".to_owned();
             }
         }
     }
 
     if SIZE_RANGE_REGEX.is_match(&upper_title) {
-        needed_description_check = true;
+        failed_reason = "No Size Given".to_owned();
     }
 
     if quantity > 1 && upper_title.contains("MIXED") {
-        needed_description_check = true;
+        failed_reason = "Mixed Sizes".to_owned();
     }
 
     if upper_title.contains("CHECK THE DESCRIPTION")
         || upper_title.contains("CHECK DESCRIPTION")
         || upper_title.contains("SEE DESCRIPTION")
     {
         if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
-            needed_description_check = true;
+            failed_reason = "Mixed Sizes".to_owned();
         }
     }
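+    // Titles shouting "READ" almost always mean "read the description";
+    // treat them as unparseable from the title alone.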
+    if upper_title.contains("READ") {
+        failed_reason = "Mixed Sizes".to_owned();
+    }
 
     if individual_size_gb > 0 {
         total_gb = individual_size_gb * quantity;
     }
 
     if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
-        needed_description_check = true;
-    }
-
-    if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
-        // This condition is implicitly handled
+        failed_reason = "No size given".to_owned();
     }
 
     ParsedStorage {
@@ -106,8 +105,8 @@
         total_gigabytes: total_gb,
         quantity,
         individual_size_gigabytes: individual_size_gb,
-        needed_description_check,
-        parse_engine: 0,
+        failed_reason,
+        parse_engine: crate::db::StorageParsingEngineVersion::Regex,
     }
 }
 
@@ -125,8 +124,8 @@ mod tests {
                     total_gigabytes: 512 * 3,
                     quantity: 3,
                     individual_size_gigabytes: 512,
-                    parse_engine: 0,
-                    needed_description_check: false,
+                    parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                    failed_reason: String::new(),
                 },
                 true,
             ),
@@ -138,8 +137,8 @@
                     total_gigabytes: 240,
                     quantity: 1,
                     individual_size_gigabytes: 240,
-                    parse_engine: 0,
-                    needed_description_check: false,
+                    parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                    failed_reason: String::new(),
                 },
                 true,
             ),
@@ -151,8 +150,8 @@
                     total_gigabytes: 1024,
                     quantity: 1,
                     individual_size_gigabytes: 1024,
-                    parse_engine: 0,
-                    needed_description_check: true,
+                    parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                    failed_reason: String::new(),
                 },
                 false, // Sadly this one fails :/
             ),
@@ -164,8 +163,8 @@
                     total_gigabytes: 7 * 1024,
                     quantity: 1,
                     individual_size_gigabytes: 7 * 1024,
-                    parse_engine: 0,
-                    needed_description_check: false,
+                    parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                    failed_reason: String::new(),
                 },
                 true,
             ),
@@ -177,8 +176,8 @@
                     total_gigabytes: 6 * 256,
                     quantity: 6,
                     individual_size_gigabytes: 256,
-                    parse_engine: 0,
-                    needed_description_check: false,
+                    parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                    failed_reason: String::new(),
                 },
                 true,
             ),
@@ -190,8 +189,8 @@
                     total_gigabytes: 1966,
                     quantity: 1,
                     individual_size_gigabytes: 1966,
-                    parse_engine: 0,
-                    needed_description_check: false,
+                    parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                    failed_reason: String::new(),
                 },
                 true,
             ),
diff --git a/src/parser_storage_e1.rs b/src/parser_storage_e1.rs
index f920e55..9a56a98 100644
--- a/src/parser_storage_e1.rs
+++ b/src/parser_storage_e1.rs
@@ -1,10 +1,10 @@
-use crate::db::ParsedLLMStorageResult;
-use crate::db::ParsedStorage;
-use actix_web::mime::APPLICATION_JSON;
-use reqwest::header::AUTHORIZATION;
-use reqwest::header::CONTENT_TYPE;
+use crate::db::{
+    ParsedLLMStorageResult, ParsedStorage, StorageLLMVersion, StorageParsingEngineVersion,
+};
+use reqwest::header::{AUTHORIZATION, CONTENT_TYPE};
 use serde::{Deserialize, Serialize};
 use serde_json::json;
+use tracing::error;
 
 // Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD
 // (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching, then
@@ -28,22 +28,6 @@ And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
 }
 "#;
 
-fn create_request(title: &str) -> serde_json::Value {
-    json!({
-        "model": "gemini-2.5-flash-lite",
-        "messages": [
-            {
-                "role": "system",
-                "content": SYSTEM_PROMPT
-            },
-            {
-                "role": "user",
-                "content": title
-            }
-        ]
-    })
-}
-
 #[derive(Deserialize, Serialize, Debug, PartialEq, Clone)]
 struct LLMParsedResponse {
     pub quantity: i64,
@@ -66,40 +50,111 @@ struct OpenAIMessage {
     content: String,
 }
 
-/// Parses size and quantity information from an item title.
-pub async fn parse_size_and_quantity(
-    db: &rusqlite::Connection,
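+// Unit tests get a dead hostname and a dummy key so they can never hit the
+// live endpoint; non-test builds talk to the real litellm proxy.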
+#[cfg(test)]
+const OPENAI_LLM_URL: &str = "https://badurl.hak8or.com/litellm_api/chat/completions";
+#[cfg(not(test))]
+const OPENAI_LLM_URL: &str = "https://ai.hak8or.com/litellm_api/chat/completions";
+
+#[cfg(test)]
+const OPENAI_LLM_API_KEY: &str = "Bearer sk-YmVlcC1ib29wLWEtcm9ib3Q";
+#[cfg(not(test))]
+const OPENAI_LLM_API_KEY: &str = "Bearer sk-HMGML94x2ag6ggOoDghSGA";
+
+pub async fn parse_size_and_quantity_llm(
     item_id: i64,
     title: &str,
-) -> Option<ParsedStorage> {
+) -> Option<ParsedLLMStorageResult> {
     let client = reqwest::Client::new();
     let req = client
-        .post("https://ai.hak8or.com/litellm_api/chat/completions")
-        .header(CONTENT_TYPE, APPLICATION_JSON.to_string())
-        .header(AUTHORIZATION, "Bearer sk-HMGML94x2ag6ggOoDghSGA")
-        .body(create_request(title).to_string());
+        .post(OPENAI_LLM_URL)
+        .header(CONTENT_TYPE, actix_web::mime::APPLICATION_JSON.to_string())
+        .header(AUTHORIZATION, OPENAI_LLM_API_KEY)
+        .body(
+            json!({
+                "model": "gemini-2.5-flash-lite",
+                "reasoning_effort": "disable",
+                "thinking": {"type": "disabled", "budget_tokens": 0},
+                "messages": [
+                    { "role": "system", "content": SYSTEM_PROMPT },
+                    { "role": "user", "content": title }
+                ]
+            })
+            .to_string(),
+        );
     let reply_body = req.send().await.ok()?.text().await.ok()?;
     let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?;
+    match repl_json.choices.len() {
+        0 => {
+            error!("When parsing title, LLM returned ZERO choices");
+            return None;
+        }
+        1 => { /* Nothing to do */ }
+        a => error!("When parsing title, LLM returned {a}, >1 choices, using first!"),
+    }
     let reply_parsed_storage_json: LLMParsedResponse =
         serde_json::from_str(&repl_json.choices[0].message.content).ok()?;
 
-    let plsr = ParsedLLMStorageResult {
+    if !reply_parsed_storage_json.fail_reason.is_empty() {
+        error!(
+            "Failed parsing item_id:{item_id}, title:{title}, due to reason:{}",
+            reply_parsed_storage_json.fail_reason
+        );
+    }
+
+    Some(ParsedLLMStorageResult {
         id: 0,
         fail_reason: reply_parsed_storage_json.fail_reason.clone(),
         gigabytes: reply_parsed_storage_json.gigabytes,
         item_id,
         quantity: reply_parsed_storage_json.quantity,
         title: title.to_owned(),
-    };
+        llm_id: StorageLLMVersion::Gemini2d5Prompt0,
+    })
+}
+
+// Since we can't have a hashmap in a const, and I don't want to play with
+// making our parsed result struct contain a CoW string for fail_reason and
+// title, we are stuck with this ...
+pub fn parse_cached(item_id: i64, title: &str) -> Option<ParsedLLMStorageResult> {
+    match title {
+        "Lot of 2 512GB SSD 6gb/s working with 5% wear" => Some(ParsedLLMStorageResult {
+            id: 0,
+            item_id,
+            fail_reason: "".to_string(),
+            gigabytes: 512,
+            quantity: 2,
+            title: title.to_owned(),
+            llm_id: StorageLLMVersion::Testing,
+        }),
+        "Lot of 2 assorted SSD" => Some(ParsedLLMStorageResult {
+            id: 0,
+            fail_reason: "mixed sizes".to_owned(),
+            gigabytes: 0,
+            item_id,
+            quantity: 0,
+            title: title.to_owned(),
+            llm_id: StorageLLMVersion::Testing,
+        }),
+        _ => None,
+    }
+}
+
+/// Parses size and quantity information from an item title.
+pub async fn parse_size_and_quantity(
+    db: &rusqlite::Connection,
+    item_id: i64,
+    title: &str,
+) -> Option<ParsedStorage> {
+    let plsr = parse_size_and_quantity_llm(item_id, title).await?;
     plsr.add_or_update(&db);
 
     Some(ParsedStorage {
         id: 0,
         item: item_id,
-        total_gigabytes: reply_parsed_storage_json.quantity * reply_parsed_storage_json.gigabytes,
-        quantity: reply_parsed_storage_json.quantity,
-        individual_size_gigabytes: reply_parsed_storage_json.gigabytes,
-        needed_description_check: !reply_parsed_storage_json.fail_reason.is_empty(),
-        parse_engine: 1,
+        total_gigabytes: plsr.quantity * plsr.gigabytes,
+        quantity: plsr.quantity,
+        individual_size_gigabytes: plsr.gigabytes,
+        failed_reason: plsr.fail_reason,
+        parse_engine: StorageParsingEngineVersion::LLM,
     })
 }