Moaaarrr
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m26s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m42s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 5m15s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m40s

2025-09-09 00:31:23 -04:00
parent 4ae1622f02
commit 0039078f41
6 changed files with 416 additions and 131 deletions

Cargo.lock (generated, 98 lines changed)

@@ -695,7 +695,9 @@ dependencies = [
  "chrono",
  "clap",
  "dirs",
+ "futures",
  "lazy_static",
+ "num_enum",
  "rayon",
  "regex",
  "reqwest",
@@ -848,6 +850,21 @@ dependencies = [
  "new_debug_unreachable",
 ]
 
+[[package]]
+name = "futures"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
+dependencies = [
+ "futures-channel",
+ "futures-core",
+ "futures-executor",
+ "futures-io",
+ "futures-sink",
+ "futures-task",
+ "futures-util",
+]
+
 [[package]]
 name = "futures-channel"
 version = "0.3.31"
@@ -864,12 +881,34 @@ version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
 
+[[package]]
+name = "futures-executor"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
+dependencies = [
+ "futures-core",
+ "futures-task",
+ "futures-util",
+]
+
 [[package]]
 name = "futures-io"
 version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
 
+[[package]]
+name = "futures-macro"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "futures-sink"
 version = "0.3.31"
@@ -888,8 +927,10 @@ version = "0.3.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
 dependencies = [
+ "futures-channel",
  "futures-core",
  "futures-io",
+ "futures-macro",
  "futures-sink",
  "futures-task",
  "memchr",
@@ -1566,6 +1607,28 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "num_enum"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a"
+dependencies = [
+ "num_enum_derive",
+ "rustversion",
+]
+
+[[package]]
+name = "num_enum_derive"
+version = "0.7.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d"
+dependencies = [
+ "proc-macro-crate",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "object"
 version = "0.36.7"
@@ -1772,6 +1835,15 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
 
+[[package]]
+name = "proc-macro-crate"
+version = "3.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
+dependencies = [
+ "toml_edit",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.95"
@@ -2534,6 +2606,23 @@ dependencies = [
  "tokio",
 ]
 
+[[package]]
+name = "toml_datetime"
+version = "0.6.11"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
+
+[[package]]
+name = "toml_edit"
+version = "0.22.27"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
+dependencies = [
+ "indexmap",
+ "toml_datetime",
+ "winnow",
+]
+
 [[package]]
 name = "tower"
 version = "0.5.2"
@@ -3009,6 +3098,15 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
+[[package]]
+name = "winnow"
+version = "0.7.13"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "wit-bindgen-rt"
 version = "0.39.0"

Cargo.toml

@@ -8,7 +8,9 @@ actix-web = "4.11.0"
 chrono = { version = "0.4.41", features = ["serde"] }
 clap = { version = "4.5.40", features = ["derive"] }
 dirs = "6.0.0"
+futures = "0.3.31"
 lazy_static = "1.5.0"
+num_enum = "0.7.4"
 rayon = "1.10.0"
 regex = "1.11.1"
 reqwest = { version = "0.12.23", features = ["blocking"] }
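
Note: the two new dependencies map to the two halves of this commit: num_enum derives TryFromPrimitive for the enums added to src/db.rs, and futures supplies futures::future::join_all used in src/main.rs. A minimal, self-contained sketch of the join_all pattern (illustrative names; driven here by futures' own block_on rather than the actix runtime):

use futures::executor::block_on;
use futures::future::join_all;

async fn double(x: i64) -> i64 {
    x * 2
}

fn main() {
    // Build all the futures up front, then await them as one unit;
    // join_all resolves to a Vec in the same order as the inputs.
    let futs: Vec<_> = (0..4).map(double).collect();
    let results = block_on(join_all(futs));
    assert_eq!(results, vec![0, 2, 4, 6]);
}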

src/db.rs (135 lines changed)

@@ -1,5 +1,8 @@
 use chrono::{DateTime, Utc};
+use num_enum::TryFromPrimitive;
 use rusqlite::Connection;
+use rusqlite::ToSql;
+use rusqlite::types::FromSql;
 use serde::Deserialize;
 use serde::Serialize;
 use std::path::Path;
@@ -205,15 +208,34 @@ impl ParsedPage {
     }
 }
 
-#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
+#[repr(i64)]
+#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)]
+pub enum StorageParsingEngineVersion {
+    Testing = 0,
+    Regex = 1,
+    LLM = 2,
+}
+
+impl ToSql for StorageParsingEngineVersion {
+    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
+        Ok((*self as i64).into())
+    }
+}
+
+impl FromSql for StorageParsingEngineVersion {
+    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
+        let v = value.as_i64()?;
+        Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
+    }
+}
+
+#[derive(Serialize, Debug, PartialEq, Clone)]
 pub struct ParsedStorage {
     pub id: i64,
     pub item: i64,
     pub total_gigabytes: i64,
     pub quantity: i64,
     pub individual_size_gigabytes: i64,
-    pub parse_engine: i64,
-    pub needed_description_check: bool,
+    pub parse_engine: StorageParsingEngineVersion,
+    pub failed_reason: String,
 }
 
 impl DBTable for ParsedStorage {
     const TABLE_NAME: &'static str = "Storage_Parsed";
@@ -224,13 +246,13 @@ impl DBTable for ParsedStorage {
             quantity INTEGER,
             sizes_gigabytes TEXT,
             parse_engine INTEGER,
-            need_description_check INTEGER,
+            failed_reason TEXT,
             UNIQUE(item, parse_engine)
             FOREIGN KEY(item) REFERENCES Listings(item_id)
         ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
-        let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
+        let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?;
         let iter = stmt.query_map([], |row| {
             Ok(ParsedStorage {
                 id: row.get(0)?,
@@ -242,7 +264,7 @@ impl DBTable for ParsedStorage {
                     r.parse().unwrap_or(0)
                 },
                 parse_engine: row.get(5)?,
-                needed_description_check: row.get(6)?,
+                failed_reason: row.get(6)?,
             })
         })?;
@@ -273,7 +295,7 @@ impl ParsedStorage {
                     r.parse().unwrap()
                 },
                 parse_engine: row.get(5)?,
-                needed_description_check: row.get(6)?,
+                failed_reason: row.get(6)?,
             })
         })
         .ok()
@@ -283,21 +305,26 @@ impl ParsedStorage {
     }
 
     pub fn add_or_update(&self, conn: &Connection) {
-        let _ = conn.execute(&format!("
+        let _ = conn
+            .execute(
+                &format!(
+                    "
             INSERT OR REPLACE INTO {}
-            (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
+            (item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason)
             VALUES
             (?1, ?2, ?3, ?4, ?5, ?6)",
-            Self::TABLE_NAME),
+                    Self::TABLE_NAME
+                ),
             (
                 &self.item,
                 self.total_gigabytes,
                 self.quantity,
                 self.individual_size_gigabytes.to_string(),
                 self.parse_engine,
-                self.needed_description_check
+                &self.failed_reason,
+            ),
             )
-        ).unwrap();
+            .unwrap();
     }
 }
@@ -494,19 +521,40 @@ impl Listing {
         .collect()
     }
 
-    pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
-        let mut stmt = conn
-            .prepare(&format!(
+    pub fn lookup_pending_parse(
+        conn: &Connection,
+        allowed_engines: &[i64],
+        count_limit: u64,
+    ) -> Vec<(i64, String)> {
+        let engines_filter = if !allowed_engines.is_empty() {
+            format!(
+                "AND ({})",
+                allowed_engines
+                    .iter()
+                    .map(|e| "ps.parse_engine = ".to_owned() + &e.to_string())
+                    .collect::<Vec<_>>()
+                    .join(" OR ")
+            )
+        } else {
+            String::new()
+        };
+        let query = format!(
             "
-            SELECT ei.item_id, ei.title FROM {} AS ei
-            LEFT JOIN {} AS sp ON ei.item_id = sp.item
-            WHERE sp.item IS NULL",
+            SELECT listing.item_id, listing.title FROM {0} AS listing
+            WHERE NOT EXISTS (
+                SELECT 1 FROM {1} AS ps
+                WHERE listing.item_id = ps.item {engines_filter}
+            )
+            LIMIT {count_limit}
+            ",
             Self::TABLE_NAME,
             ParsedStorage::TABLE_NAME
-            ))
+        );
+        conn.prepare(&query)
             .ok()
-            .unwrap();
-        stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
+            .unwrap()
+            .query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
             .ok()
             .unwrap()
             .map(|e| e.unwrap())
@@ -565,7 +613,7 @@ pub fn listings_get_filtered(
                 history: ItemAppearances::lookup(conn, l.item_id),
                 parsed: ParsedStorage::lookup(conn, l.item_id),
             })
-            .filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
+            .filter(|lr| lr.parsed.iter().any(|p| p.failed_reason.is_empty()))
             .collect::<Vec<ListingsFilterResult>>();
     info!(
         "Found total {} listings since (str:{} epoch:{})",
@@ -614,6 +662,24 @@ pub fn listings_get_filtered(
     listings
 }
 
+#[repr(i64)]
+#[derive(Serialize, Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
+pub enum StorageLLMVersion {
+    Testing = 0,
+    Gemini2d5Prompt0 = 1,
+}
+
+impl ToSql for StorageLLMVersion {
+    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
+        Ok((*self as i64).into())
+    }
+}
+
+impl FromSql for StorageLLMVersion {
+    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
+        let v = value.as_i64()?;
+        Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
+    }
+}
+
 // This is mostly meant as a way to cache all of these.
 #[derive(Serialize, Debug, PartialEq, Clone)]
 pub struct ParsedLLMStorageResult {
@@ -623,6 +689,7 @@ pub struct ParsedLLMStorageResult {
     pub quantity: i64,
     pub gigabytes: i64,
     pub fail_reason: String,
+    pub llm_id: StorageLLMVersion,
 }
 
 impl DBTable for ParsedLLMStorageResult {
     const TABLE_NAME: &'static str = "ParsedLLMStorageResult";
@@ -632,12 +699,13 @@ impl DBTable for ParsedLLMStorageResult {
             title TEXT NOT NULL,
             quantity INTEGER NOT NULL,
             gigabytes INTEGER NOT NULL,
-            fail_reason TEXT NOT NULL
+            fail_reason TEXT NOT NULL,
+            llm_id INTEGER NOT NULL
         ";
 
     fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
         let mut stmt = conn.prepare(&format!(
-            "SELECT id, item_id, title, quantity, gigabytes, fail_reason FROM {}",
+            "SELECT id, item_id, title, quantity, gigabytes, fail_reason, llm_id FROM {}",
             Self::TABLE_NAME
         ))?;
         let iter = stmt.query_map([], |row| {
@@ -648,6 +716,7 @@ impl DBTable for ParsedLLMStorageResult {
                 quantity: row.get(3)?,
                 gigabytes: row.get(4)?,
                 fail_reason: row.get(5)?,
+                llm_id: row.get(6)?,
             })
         })?;
@@ -674,6 +743,7 @@ impl ParsedLLMStorageResult {
                 quantity: row.get(3)?,
                 gigabytes: row.get(4)?,
                 fail_reason: row.get(5)?,
+                llm_id: row.get(6)?,
             })
         })
         .ok()
@@ -689,9 +759,10 @@ impl ParsedLLMStorageResult {
                     title,
                     quantity,
                     gigabytes,
-                    fail_reason
+                    fail_reason,
+                    llm_id
                 )
-                VALUES (?1, ?2, ?3, ?4, ?5)",
+                VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
                 Self::TABLE_NAME
             ),
             (
@@ -700,6 +771,7 @@ impl ParsedLLMStorageResult {
                 self.quantity,
                 self.gigabytes,
                 self.fail_reason.clone(),
+                self.llm_id,
             ),
         )
         .unwrap();
@@ -733,7 +805,7 @@ pub struct Stats {
     rows_parsed_storage: i64,
     rows_parsed_page: i64,
     rows_item_appearances: i64,
-    // pub rows_parsed_storage_llm: i64,
+    pub rows_parsed_storage_llm: i64,
 }
 
 pub fn get_stats(conn: &Connection) -> Stats {
@@ -743,7 +815,7 @@ pub fn get_stats(conn: &Connection) -> Stats {
         rows_parsed_storage: ParsedStorage::get_count(conn),
        rows_parsed_page: ParsedPage::get_count(conn),
         rows_item_appearances: ItemAppearances::get_count(conn),
-        // rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
+        rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
     }
 }
@@ -765,7 +837,7 @@ mod tests {
         let listing = Listing {
            id: 1,
             item_id: 1234,
-            title: "Some Title".to_string(),
+            title: "Lot of 2 512GB SSD 6gb/s working with 5% wear".to_string(),
             buy_it_now_price_cents: Some(123),
             has_best_offer: false,
             image_url: "google.com".to_string(),
@@ -779,8 +851,8 @@ mod tests {
             total_gigabytes: 13,
             quantity: 3,
             individual_size_gigabytes: 13,
-            parse_engine: 9,
-            needed_description_check: true,
+            parse_engine: StorageParsingEngineVersion::Testing,
+            failed_reason: "".to_owned(),
         };
         parsed.add_or_update(&db);
         assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
@@ -811,6 +883,7 @@ mod tests {
             item_id: 12345,
             quantity: 32,
             title: "Some Title".to_owned(),
+            llm_id: StorageLLMVersion::Testing,
         };
         parsedllmstorage.add_or_update(&db);
         assert_eq!(
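
The ToSql/FromSql pair above is why the TryFromPrimitive derive is pulled in: each enum is stored as its i64 discriminant, and an unknown discriminant surfaces as FromSqlError::OutOfRange instead of a panic. A standalone sketch of that round trip, assuming only the num_enum crate (names here are illustrative):

use num_enum::TryFromPrimitive;

#[repr(i64)]
#[derive(Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
enum Engine {
    Testing = 0,
    Regex = 1,
    LLM = 2,
}

fn main() {
    // Writing: the enum flattens to its discriminant, which is what the
    // ToSql impls hand to rusqlite as an INTEGER column value.
    let stored = Engine::LLM as i64;
    assert_eq!(stored, 2);

    // Reading: try_from() rejects values no variant claims, which the
    // FromSql impls map onto FromSqlError::OutOfRange.
    assert_eq!(Engine::try_from(stored).unwrap(), Engine::LLM);
    assert!(Engine::try_from(99).is_err());
}

For reference, lookup_pending_parse(&conn, &[1, 2], 10) would render its inner filter as "AND (ps.parse_engine = 1 OR ps.parse_engine = 2)" and cap the result set with "LIMIT 10".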

src/main.rs

@@ -1,12 +1,11 @@
 use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
 use chrono::{DateTime, Utc};
 use clap::Parser;
-use ebay_scraper_rust::db::{
-    DBTable, ItemAppearances, Listing, ParsedLLMStorageResult, ParsedPage, ParsedStorage,
-    SearchURL, get_initialized, get_stats, listings_get_filtered,
-};
+use ebay_scraper_rust::db;
+use ebay_scraper_rust::db::DBTable;
 use ebay_scraper_rust::parser::parse_dir;
 use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
+use futures::future::join_all;
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::sync::Mutex;
@@ -32,6 +31,7 @@ struct AppCtx {
     db: rusqlite::Connection,
     db_llm: rusqlite::Connection,
     download_dir: PathBuf,
+    llm_parser: Option<actix_web::rt::task::JoinHandle<()>>,
 }
 
 #[derive(Deserialize, Debug)]
@@ -47,7 +47,7 @@ async fn listings_filtered_get(
     filter: web::Query<ListingsFilter>,
 ) -> Result<impl Responder> {
     let start = Instant::now();
-    let res = listings_get_filtered(
+    let res = db::listings_get_filtered(
         &ctx.lock().unwrap().db,
         &DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
         filter.limit.unwrap_or(1_000),
@@ -64,12 +64,12 @@ async fn listings_filtered_get(
 
 #[get("/listing/{id}")]
 async fn listing_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
-    Ok(web::Json(Listing::lookup(&ctx.lock().unwrap().db, *id)))
+    Ok(web::Json(db::Listing::lookup(&ctx.lock().unwrap().db, *id)))
 }
 
 #[get("/listing/{id}/parsed")]
 async fn listing_parse_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
-    Ok(web::Json(ParsedStorage::lookup(
+    Ok(web::Json(db::ParsedStorage::lookup(
         &ctx.lock().unwrap().db,
         *id,
     )))
@@ -86,7 +86,7 @@ async fn listing_history_get(
     ctx: Data<Mutex<AppCtx>>,
     id: web::Path<i64>,
 ) -> Result<impl Responder> {
-    let history: Vec<_> = ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
+    let history: Vec<_> = db::ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
         .iter()
         // .inspect(|e| info!("got: {:?}", e))
         .filter_map(|e| {
@@ -99,29 +99,85 @@ async fn listing_history_get(
     Ok(web::Json(history))
 }
 
+async fn storage_parse_work(entries: &[(i64, String)]) -> Vec<db::ParsedStorage> {
+    let llm_futures: Vec<_> = entries
+        .iter()
+        .map(|(id, title)| parser_storage_e1::parse_size_and_quantity_llm(*id, title))
+        .collect();
+    let llm_future_results = join_all(llm_futures).await;
+    let llm_results = llm_future_results
+        .iter()
+        .flatten()
+        .map(|e| db::ParsedStorage {
+            id: 0,
+            item: e.item_id,
+            total_gigabytes: e.quantity * e.gigabytes,
+            quantity: e.quantity,
+            individual_size_gigabytes: e.gigabytes,
+            failed_reason: e.fail_reason.clone(),
+            parse_engine: db::StorageParsingEngineVersion::LLM,
+        });
+    // .inspect(|e| e.add_or_update(&unlocked.db))
+    // .map(|e| db::ParsedStorage {
+    //     id: 0,
+    //     item: e.item_id,
+    //     total_gigabytes: e.quantity * e.gigabytes,
+    //     quantity: e.quantity,
+    //     individual_size_gigabytes: e.gigabytes,
+    //     needed_description_check: !e.fail_reason.is_empty(),
+    //     parse_engine: db::StorageParsingEngineVersion::LLM,
+    // })
+    // .for_each(|e| e.add_or_update(&unlocked.db));
+
+    // And a regex based parse.
+    let regex_results = entries
+        .iter()
+        .map(|(id, title)| parser_storage_e0::parse_size_and_quantity(*id, &title));
+    // .for_each(|e| e.add_or_update(&unlocked.db));
+
+    regex_results.chain(llm_results).collect()
+}
+
+fn storage_parse_worker(ctx: Data<Mutex<AppCtx>>) -> actix_web::rt::task::JoinHandle<()> {
+    actix_web::rt::spawn(async move {
+        loop {
+            actix_web::rt::time::sleep(std::time::Duration::from_millis(1000)).await;
+            let ctx_unlocked = ctx.lock().unwrap();
+            let entries = db::Listing::lookup_pending_parse(&ctx_unlocked.db, &[], 10);
+            let parsed = storage_parse_work(entries.as_slice()).await;
+            for p in parsed {
+                p.add_or_update(&ctx_unlocked.db);
+            }
+        }
+    })
+}
+
 #[post("/listing/parse")]
 async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
+    // Prepare a background parser to go through and use an LLM to parse the
+    // storage info.
+    if ctx.lock().unwrap().llm_parser.is_none() {
+        ctx.clone().lock().unwrap().llm_parser = Some(storage_parse_worker(ctx.clone()));
+    }
+
     // Lets grab a few entries and then try parsing them with two engines.
     let ctx_locked = ctx.lock().unwrap();
-    let entries: Vec<_> = Listing::lookup_non_parsed(&ctx_locked.db)
+    let entries: Vec<_> = db::Listing::lookup_pending_parse(&ctx_locked.db, &[], 100)
         .iter()
         .take(10)
         .map(|e| e.clone())
         .collect();
 
     for (item_id, title) in &entries {
-        let ps0 = parser_storage_e0::parse_size_and_quantity(*item_id, &title);
-        ps0.add_or_update(&ctx_locked.db);
-
         let ps1 =
             parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await;
         if ps1.is_some() {
-            info!(
-                "Parsed using an LLM title:{} and results:{:?}",
-                title,
-                ps1.unwrap()
-            );
+            // info!(
+            //     "Parsed using an LLM title:{} and results:{:?}",
+            //     title,
+            //     ps1.unwrap()
+            // );
             ps1.unwrap().add_or_update(&ctx_locked.db);
-            ps1.unwrap().add_or_update(&ctx_locked.db_llm);
+            // ps1.unwrap().add_or_update(&ctx_locked.db_llm); No need
         } else {
            error!("Failed to parse {item_id} with title {title}");
         }
@@ -131,7 +187,7 @@ async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
 
 #[get("/category")]
 async fn category_getnames(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
-    Ok(web::Json(SearchURL::names(&ctx.lock().unwrap().db)))
+    Ok(web::Json(db::SearchURL::names(&ctx.lock().unwrap().db)))
 }
 
 #[post("/category/{category}/parse")]
@@ -156,9 +212,9 @@ async fn category_parse(
 
 #[get("/stats")]
 async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
-    let stats_db = get_stats(&ctx.lock().unwrap().db);
-    // let stats_db_llm = get_stats(&ctx.lock().unwrap().db_llm);
-    // stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
+    let mut stats_db = db::get_stats(&ctx.lock().unwrap().db);
+    let stats_db_llm = db::get_stats(&ctx.lock().unwrap().db_llm);
+    stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
     Ok(web::Json(stats_db))
 }
 
@@ -166,11 +222,11 @@ async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
 async fn admin_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
     let ctx_locked = ctx.lock().unwrap();
     let query_start_time = Instant::now();
-    let search_urls = SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
-    let parsed_pages = ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
-    let parsed_storages = ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
-    let item_appearances = ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
-    let listings = Listing::get_all(&ctx_locked.db).unwrap_or_default();
+    let search_urls = db::SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
+    let parsed_pages = db::ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
+    let parsed_storages = db::ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
+    let item_appearances = db::ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
+    let listings = db::Listing::get_all(&ctx_locked.db).unwrap_or_default();
     let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
 
     let html_gen_start_time = Instant::now();
@@ -313,17 +369,19 @@ async fn main() -> std::io::Result<()> {
     let app_data = Data::new(Mutex::new(AppCtx {
         download_dir: scrapedatadir.clone(),
-        db: get_initialized(None),
+        db: db::get_initialized(None),
         db_llm: {
             let db_path = scrapedatadir.with_file_name("llm.sqlite");
-            let db = rusqlite::Connection::open(db_path).unwrap();
-            ParsedLLMStorageResult::initialize(&db);
+            let db = rusqlite::Connection::open(&db_path).unwrap();
+            db::ParsedLLMStorageResult::initialize(&db);
+            info!("Created {:?} for caching LLM parsed title.", db_path);
             db
         },
+        llm_parser: None,
     }));
 
     // Prepare our backend via pulling in what catagories we are preconfigured with.
-    SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
+    db::SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
 
     HttpServer::new(move || {
         App::new()
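
storage_parse_worker rides on actix-web's re-exported tokio runtime, and the returned JoinHandle is parked in AppCtx::llm_parser so parse_listings only ever spawns one worker. A trimmed sketch of that spawn-and-poll shape (the task body here is a stand-in, not the commit's worker):

use std::time::Duration;

#[actix_web::main]
async fn main() {
    // Detached task on the actix (tokio) runtime; keep the handle around,
    // as AppCtx::llm_parser does, so a second spawn can be skipped.
    let handle = actix_web::rt::spawn(async {
        loop {
            actix_web::rt::time::sleep(Duration::from_millis(1000)).await;
            println!("tick: look up pending listings and parse them here");
        }
    });

    // Let a few iterations run, then tear the worker down.
    actix_web::rt::time::sleep(Duration::from_millis(3500)).await;
    handle.abort();
}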

src/parser_storage_e0.rs

@@ -32,7 +32,7 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
     let upper_title = title.to_uppercase();
     let mut total_gb = 0i64;
     let mut quantity = 1i64;
-    let mut needed_description_check = false;
+    let mut failed_reason = String::new();
     let mut individual_size_gb = 0i64;
 
     for pattern in EXPLICIT_QTY_PATTERNS.iter() {
@@ -68,36 +68,35 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
     if !unique_sizes_gb.is_empty() {
         individual_size_gb = unique_sizes_gb[0];
         if unique_sizes_gb.len() > 1 {
-            needed_description_check = true;
+            failed_reason = "Mixed Sizes".to_owned();
         }
     }
 
     if SIZE_RANGE_REGEX.is_match(&upper_title) {
-        needed_description_check = true;
+        failed_reason = "No Size Given".to_owned();
     }
 
     if quantity > 1 && upper_title.contains("MIXED") {
-        needed_description_check = true;
+        failed_reason = "Mixed Sizes".to_owned();
     }
 
     if upper_title.contains("CHECK THE DESCRIPTION")
         || upper_title.contains("CHECK DESCRIPTION")
         || upper_title.contains("SEE DESCRIPTION")
     {
         if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
-            needed_description_check = true;
+            failed_reason = "Mixed Sizes".to_owned();
         }
     }
 
+    if upper_title.contains("READ") {
+        failed_reason = "Mixed Sizes".to_owned();
+    }
+
     if individual_size_gb > 0 {
         total_gb = individual_size_gb * quantity;
     }
 
     if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
-        needed_description_check = true;
-    }
-
-    if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
-        // This condition is implicitly handled
+        failed_reason = "No size given".to_owned();
     }
 
     ParsedStorage {
@@ -106,8 +105,8 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
         total_gigabytes: total_gb,
         quantity,
         individual_size_gigabytes: individual_size_gb,
-        needed_description_check,
-        parse_engine: 0,
+        failed_reason: failed_reason,
+        parse_engine: crate::db::StorageParsingEngineVersion::Regex,
     }
 }
@@ -125,8 +124,8 @@ mod tests {
                 total_gigabytes: 512 * 3,
                 quantity: 3,
                 individual_size_gigabytes: 512,
-                parse_engine: 0,
-                needed_description_check: false,
+                parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                failed_reason: String::new(),
             },
             true,
         ),
@@ -138,8 +137,8 @@ mod tests {
                 total_gigabytes: 240,
                 quantity: 1,
                 individual_size_gigabytes: 240,
-                parse_engine: 0,
-                needed_description_check: false,
+                parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                failed_reason: String::new(),
             },
             true,
         ),
@@ -151,8 +150,8 @@ mod tests {
                 total_gigabytes: 1024,
                 quantity: 1,
                 individual_size_gigabytes: 1024,
-                parse_engine: 0,
-                needed_description_check: true,
+                parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                failed_reason: String::new(),
             },
             false, // Sadly this one fails :/
         ),
@@ -164,8 +163,8 @@ mod tests {
                 total_gigabytes: 7 * 1024,
                 quantity: 1,
                 individual_size_gigabytes: 7 * 1024,
-                parse_engine: 0,
-                needed_description_check: false,
+                parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                failed_reason: String::new(),
             },
             true,
         ),
@@ -177,8 +176,8 @@ mod tests {
                 total_gigabytes: 6 * 256,
                 quantity: 6,
                 individual_size_gigabytes: 256,
-                parse_engine: 0,
-                needed_description_check: false,
+                parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                failed_reason: String::new(),
             },
             true,
         ),
@@ -190,8 +189,8 @@ mod tests {
                 total_gigabytes: 1966,
                 quantity: 1,
                 individual_size_gigabytes: 1966,
-                parse_engine: 0,
-                needed_description_check: false,
+                parse_engine: crate::db::StorageParsingEngineVersion::Regex,
+                failed_reason: String::new(),
             },
             true,
        ),
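
With the boolean gone, callers now branch on whether failed_reason is empty, exactly as listings_get_filtered does in src/db.rs above. A tiny sketch of that consumer-side check (struct trimmed to the one relevant field):

struct Parsed {
    failed_reason: String,
}

fn main() {
    let results = [
        Parsed { failed_reason: String::new() },
        Parsed { failed_reason: "Mixed Sizes".to_owned() },
    ];

    // An empty string marks a clean parse; anything else names the failure,
    // so filtering on is_empty() keeps only unambiguous listings.
    let clean: Vec<_> = results.iter().filter(|p| p.failed_reason.is_empty()).collect();
    assert_eq!(clean.len(), 1);
}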

src/parser_storage_e1.rs

@@ -1,10 +1,10 @@
-use crate::db::ParsedLLMStorageResult;
-use crate::db::ParsedStorage;
-use actix_web::mime::APPLICATION_JSON;
-use reqwest::header::AUTHORIZATION;
-use reqwest::header::CONTENT_TYPE;
+use crate::db::{
+    ParsedLLMStorageResult, ParsedStorage, StorageLLMVersion, StorageParsingEngineVersion,
+};
+use reqwest::header::{AUTHORIZATION, CONTENT_TYPE};
 use serde::{Deserialize, Serialize};
 use serde_json::json;
+use tracing::error;
 
 // Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD
 // (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching, then
@@ -28,22 +28,6 @@ And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
 }
 "#;
 
-fn create_request(title: &str) -> serde_json::Value {
-    json!({
-        "model": "gemini-2.5-flash-lite",
-        "messages": [
-            {
-                "role": "system",
-                "content": SYSTEM_PROMPT
-            },
-            {
-                "role": "user",
-                "content": title
-            }
-        ]
-    })
-}
-
 #[derive(Deserialize, Serialize, Debug, PartialEq, Clone)]
 struct LLMParsedResponse {
     pub quantity: i64,
@@ -66,40 +50,111 @@ struct OpenAIMessage {
     content: String,
 }
 
-/// Parses size and quantity information from an item title.
-pub async fn parse_size_and_quantity(
-    db: &rusqlite::Connection,
+#[cfg(test)]
+const OPENAI_LLM_URL: &str = "https://badurl.hak8or.com/litellm_api/chat/completions";
+#[cfg(not(test))]
+const OPENAI_LLM_URL: &str = "https://ai.hak8or.com/litellm_api/chat/completions";
+
+#[cfg(test)]
+const OPENAI_LLM_API_KEY: &str = "Bearer sk-YmVlcC1ib29wLWEtcm9ib3Q";
+#[cfg(not(test))]
+const OPENAI_LLM_API_KEY: &str = "Bearer sk-HMGML94x2ag6ggOoDghSGA";
+
+pub async fn parse_size_and_quantity_llm(
     item_id: i64,
     title: &str,
-) -> Option<ParsedStorage> {
+) -> Option<ParsedLLMStorageResult> {
     let client = reqwest::Client::new();
     let req = client
-        .post("https://ai.hak8or.com/litellm_api/chat/completions")
-        .header(CONTENT_TYPE, APPLICATION_JSON.to_string())
-        .header(AUTHORIZATION, "Bearer sk-HMGML94x2ag6ggOoDghSGA")
-        .body(create_request(title).to_string());
+        .post(OPENAI_LLM_URL)
+        .header(CONTENT_TYPE, actix_web::mime::APPLICATION_JSON.to_string())
+        .header(AUTHORIZATION, OPENAI_LLM_API_KEY)
+        .body(
+            json!({
+                "model": "gemini-2.5-flash-lite",
+                "reasoning_effort": "disable",
+                "thinking": {"type": "disabled", "budget_tokens": 0},
+                "messages": [
+                    { "role": "system", "content": SYSTEM_PROMPT },
+                    { "role": "user", "content": title }
+                ]
+            })
+            .to_string(),
+        );
 
     let reply_body = req.send().await.ok()?.text().await.ok()?;
     let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?;
+    match repl_json.choices.len() {
+        0 => {
+            error!("When parsing title, LLM returned ZERO choices");
+            return None;
+        }
+        1 => { /* Nothing to do */ }
+        a => error!("When parsing title, LLM returned {a}, >1 choices, using first!"),
+    }
+
     let reply_parsed_storage_json: LLMParsedResponse =
         serde_json::from_str(&repl_json.choices[0].message.content).ok()?;
-    let plsr = ParsedLLMStorageResult {
+    if !reply_parsed_storage_json.fail_reason.is_empty() {
+        error!(
+            "Failed parsing item_id:{item_id}, title:{title}, due to reason:{}",
+            reply_parsed_storage_json.fail_reason
+        );
+    }
+
+    Some(ParsedLLMStorageResult {
         id: 0,
         fail_reason: reply_parsed_storage_json.fail_reason.clone(),
         gigabytes: reply_parsed_storage_json.gigabytes,
         item_id,
         quantity: reply_parsed_storage_json.quantity,
         title: title.to_owned(),
-    };
+        llm_id: StorageLLMVersion::Gemini2d5Prompt0,
+    })
+}
+
+// Since we can't have a hashmap in a const, and I don't want to play with
+// making our parsed result struct contain a CoW string for fail_reason and
+// title, we are stuck with this ...
+pub fn parse_cached(item_id: i64, title: &str) -> Option<ParsedLLMStorageResult> {
+    match title {
+        "Lot of 2 512GB SSD 6gb/s working with 5% wear" => Some(ParsedLLMStorageResult {
+            id: 0,
+            item_id: item_id,
+            fail_reason: "".to_string(),
+            gigabytes: 512,
+            quantity: 2,
+            title: title.to_owned(),
+            llm_id: StorageLLMVersion::Testing,
+        }),
+        "Lot of 2 assorted SSD" => Some(ParsedLLMStorageResult {
+            id: 0,
+            fail_reason: "mixed sizes".to_owned(),
+            gigabytes: 0,
+            item_id,
+            quantity: 0,
+            title: title.to_owned(),
+            llm_id: StorageLLMVersion::Testing,
+        }),
+        _ => None,
+    }
+}
+
+/// Parses size and quantity information from an item title.
+pub async fn parse_size_and_quantity(
+    db: &rusqlite::Connection,
+    item_id: i64,
+    title: &str,
+) -> Option<ParsedStorage> {
+    let plsr = parse_size_and_quantity_llm(item_id, title).await?;
     plsr.add_or_update(&db);
 
     Some(ParsedStorage {
         id: 0,
         item: item_id,
-        total_gigabytes: reply_parsed_storage_json.quantity * reply_parsed_storage_json.gigabytes,
-        quantity: reply_parsed_storage_json.quantity,
-        individual_size_gigabytes: reply_parsed_storage_json.gigabytes,
-        needed_description_check: !reply_parsed_storage_json.fail_reason.is_empty(),
-        parse_engine: 1,
+        total_gigabytes: plsr.quantity * plsr.gigabytes,
+        quantity: plsr.quantity,
+        individual_size_gigabytes: plsr.gigabytes,
+        failed_reason: plsr.fail_reason,
+        parse_engine: StorageParsingEngineVersion::LLM,
     })
 }
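
parse_cached exists so tests can exercise the pipeline without touching the network, while the cfg(test) constants point the real path at a dead host so any stray request fails fast. A hypothetical test built on it (not part of the commit; the titles must match parse_cached's match arms byte for byte):

#[test]
fn cached_titles_round_trip() {
    let ok = parse_cached(1234, "Lot of 2 512GB SSD 6gb/s working with 5% wear").unwrap();
    assert_eq!(ok.quantity, 2);
    assert_eq!(ok.gigabytes, 512);
    assert!(ok.fail_reason.is_empty());

    let mixed = parse_cached(1234, "Lot of 2 assorted SSD").unwrap();
    assert_eq!(mixed.fail_reason, "mixed sizes");

    // Unknown titles fall through to None rather than guessing.
    assert!(parse_cached(1234, "Some other title").is_none());
}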