Moaaarrr
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m26s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m42s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 5m15s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m40s

This commit is contained in:
2025-09-09 00:31:23 -04:00
parent 4ae1622f02
commit 0039078f41
6 changed files with 416 additions and 131 deletions

98
Cargo.lock generated
View File

@@ -695,7 +695,9 @@ dependencies = [
"chrono",
"clap",
"dirs",
"futures",
"lazy_static",
"num_enum",
"rayon",
"regex",
"reqwest",
@@ -848,6 +850,21 @@ dependencies = [
"new_debug_unreachable",
]
[[package]]
name = "futures"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
dependencies = [
"futures-channel",
"futures-core",
"futures-executor",
"futures-io",
"futures-sink",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-channel"
version = "0.3.31"
@@ -864,12 +881,34 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
[[package]]
name = "futures-executor"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
dependencies = [
"futures-core",
"futures-task",
"futures-util",
]
[[package]]
name = "futures-io"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
[[package]]
name = "futures-macro"
version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "futures-sink"
version = "0.3.31"
@@ -888,8 +927,10 @@ version = "0.3.31"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
dependencies = [
"futures-channel",
"futures-core",
"futures-io",
"futures-macro",
"futures-sink",
"futures-task",
"memchr",
@@ -1566,6 +1607,28 @@ dependencies = [
"autocfg",
]
[[package]]
name = "num_enum"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a"
dependencies = [
"num_enum_derive",
"rustversion",
]
[[package]]
name = "num_enum_derive"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d"
dependencies = [
"proc-macro-crate",
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "object"
version = "0.36.7"
@@ -1772,6 +1835,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-crate"
version = "3.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
dependencies = [
"toml_edit",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
@@ -2534,6 +2606,23 @@ dependencies = [
"tokio",
]
[[package]]
name = "toml_datetime"
version = "0.6.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
[[package]]
name = "toml_edit"
version = "0.22.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
dependencies = [
"indexmap",
"toml_datetime",
"winnow",
]
[[package]]
name = "tower"
version = "0.5.2"
@@ -3009,6 +3098,15 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "winnow"
version = "0.7.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
dependencies = [
"memchr",
]
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"

View File

@@ -8,7 +8,9 @@ actix-web = "4.11.0"
chrono = { version = "0.4.41", features = ["serde"] }
clap = { version = "4.5.40", features = ["derive"] }
dirs = "6.0.0"
futures = "0.3.31"
lazy_static = "1.5.0"
num_enum = "0.7.4"
rayon = "1.10.0"
regex = "1.11.1"
reqwest = { version = "0.12.23", features = ["blocking"] }

153
src/db.rs
View File

@@ -1,5 +1,8 @@
use chrono::{DateTime, Utc};
use num_enum::TryFromPrimitive;
use rusqlite::Connection;
use rusqlite::ToSql;
use rusqlite::types::FromSql;
use serde::Deserialize;
use serde::Serialize;
use std::path::Path;
@@ -205,15 +208,34 @@ impl ParsedPage {
}
}
/// Identifies which parsing engine produced a `ParsedStorage` row.
///
/// Persisted to SQLite as the i64 discriminant (hence the explicit
/// `#[repr(i64)]` and the `TryFromPrimitive` derive, which provides the
/// checked reverse conversion used by the `FromSql` impl below).
#[repr(i64)]
#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)]
pub enum StorageParsingEngineVersion {
    /// Placeholder engine used only by unit tests.
    Testing = 0,
    /// The hand-written regex/heuristic title parser.
    Regex = 1,
    /// The LLM-backed title parser.
    LLM = 2,
}
/// Stores the engine version in SQLite as its underlying i64 discriminant.
impl ToSql for StorageParsingEngineVersion {
    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
        let discriminant = *self as i64;
        Ok(rusqlite::types::ToSqlOutput::from(discriminant))
    }
}
/// Reads the engine version back from its i64 column; unknown
/// discriminants are reported as out-of-range rather than panicking.
impl FromSql for StorageParsingEngineVersion {
    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
        let raw = value.as_i64()?;
        match Self::try_from(raw) {
            Ok(engine) => Ok(engine),
            Err(_) => Err(rusqlite::types::FromSqlError::OutOfRange(raw)),
        }
    }
}
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedStorage {
pub id: i64,
pub item: i64,
pub total_gigabytes: i64,
pub quantity: i64,
pub individual_size_gigabytes: i64,
pub parse_engine: i64,
pub needed_description_check: bool,
pub parse_engine: StorageParsingEngineVersion,
pub failed_reason: String,
}
impl DBTable for ParsedStorage {
const TABLE_NAME: &'static str = "Storage_Parsed";
@@ -224,13 +246,13 @@ impl DBTable for ParsedStorage {
quantity INTEGER,
sizes_gigabytes TEXT,
parse_engine INTEGER,
need_description_check INTEGER,
failed_reason TEXT,
UNIQUE(item, parse_engine)
FOREIGN KEY(item) REFERENCES Listings(item_id)
";
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?;
let iter = stmt.query_map([], |row| {
Ok(ParsedStorage {
id: row.get(0)?,
@@ -242,7 +264,7 @@ impl DBTable for ParsedStorage {
r.parse().unwrap_or(0)
},
parse_engine: row.get(5)?,
needed_description_check: row.get(6)?,
failed_reason: row.get(6)?,
})
})?;
@@ -273,7 +295,7 @@ impl ParsedStorage {
r.parse().unwrap()
},
parse_engine: row.get(5)?,
needed_description_check: row.get(6)?,
failed_reason: row.get(6)?,
})
})
.ok()
@@ -283,21 +305,26 @@ impl ParsedStorage {
}
pub fn add_or_update(&self, conn: &Connection) {
let _ = conn.execute(&format!("
let _ = conn
.execute(
&format!(
"
INSERT OR REPLACE INTO {}
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason)
VALUES
(?1, ?2, ?3, ?4, ?5, ?6)",
Self::TABLE_NAME),
(
&self.item,
self.total_gigabytes,
self.quantity,
self.individual_size_gigabytes.to_string(),
self.parse_engine,
self.needed_description_check
Self::TABLE_NAME
),
(
&self.item,
self.total_gigabytes,
self.quantity,
self.individual_size_gigabytes.to_string(),
self.parse_engine,
&self.failed_reason,
),
)
).unwrap();
.unwrap();
}
}
@@ -494,19 +521,40 @@ impl Listing {
.collect()
}
pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
let mut stmt = conn
.prepare(&format!(
"
SELECT ei.item_id, ei.title FROM {} AS ei
LEFT JOIN {} AS sp ON ei.item_id = sp.item
WHERE sp.item IS NULL",
Self::TABLE_NAME,
ParsedStorage::TABLE_NAME
))
pub fn lookup_pending_parse(
conn: &Connection,
allowed_engines: &[i64],
count_limit: u64,
) -> Vec<(i64, String)> {
let engines_filter = if !allowed_engines.is_empty() {
format!(
"AND ({})",
allowed_engines
.iter()
.map(|e| "ps.parse_engine = ".to_owned() + &e.to_string())
.collect::<Vec<_>>()
.join(" OR ")
)
} else {
String::new()
};
let query = format!(
"
SELECT listing.item_id, listing.title FROM {0} AS listing
WHERE NOT EXISTS (
SELECT 1 FROM {1} AS ps
WHERE listing.item_id = ps.item {engines_filter}
)
LIMIT {count_limit}
",
Self::TABLE_NAME,
ParsedStorage::TABLE_NAME
);
conn.prepare(&query)
.ok()
.unwrap();
stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
.unwrap()
.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
.ok()
.unwrap()
.map(|e| e.unwrap())
@@ -565,7 +613,7 @@ pub fn listings_get_filtered(
history: ItemAppearances::lookup(conn, l.item_id),
parsed: ParsedStorage::lookup(conn, l.item_id),
})
.filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
.filter(|lr| lr.parsed.iter().any(|p| p.failed_reason.is_empty()))
.collect::<Vec<ListingsFilterResult>>();
info!(
"Found total {} listings since (str:{} epoch:{})",
@@ -614,6 +662,24 @@ pub fn listings_get_filtered(
listings
}
/// Identifies the LLM model + prompt revision that produced a cached parse
/// result; persisted to SQLite as the i64 discriminant.
#[repr(i64)]
#[derive(Serialize, Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
pub enum StorageLLMVersion {
    /// Placeholder used only by unit tests.
    Testing = 0,
    /// Gemini 2.5 (flash-lite) with prompt revision 0.
    Gemini2d5Prompt0 = 1,
}
/// Stores the LLM version in SQLite as its underlying i64 discriminant.
impl ToSql for StorageLLMVersion {
    fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
        let discriminant = *self as i64;
        Ok(rusqlite::types::ToSqlOutput::from(discriminant))
    }
}
/// Reads the LLM version back from its i64 column; unknown discriminants
/// are reported as out-of-range rather than panicking.
impl FromSql for StorageLLMVersion {
    fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
        let raw = value.as_i64()?;
        match Self::try_from(raw) {
            Ok(version) => Ok(version),
            Err(_) => Err(rusqlite::types::FromSqlError::OutOfRange(raw)),
        }
    }
}
// This is mostly meant as a way to cache all of these.
#[derive(Serialize, Debug, PartialEq, Clone)]
pub struct ParsedLLMStorageResult {
@@ -623,6 +689,7 @@ pub struct ParsedLLMStorageResult {
pub quantity: i64,
pub gigabytes: i64,
pub fail_reason: String,
pub llm_id: StorageLLMVersion,
}
impl DBTable for ParsedLLMStorageResult {
const TABLE_NAME: &'static str = "ParsedLLMStorageResult";
@@ -632,12 +699,13 @@ impl DBTable for ParsedLLMStorageResult {
title TEXT NOT NULL,
quantity INTEGER NOT NULL,
gigabytes INTEGER NOT NULL,
fail_reason TEXT NOT NULL
fail_reason TEXT NOT NULL,
llm_id INTEGER NOT NULL
";
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
let mut stmt = conn.prepare(&format!(
"SELECT id, item_id, title, quantity, gigabytes, fail_reason FROM {}",
"SELECT id, item_id, title, quantity, gigabytes, fail_reason, llm_id FROM {}",
Self::TABLE_NAME
))?;
let iter = stmt.query_map([], |row| {
@@ -648,6 +716,7 @@ impl DBTable for ParsedLLMStorageResult {
quantity: row.get(3)?,
gigabytes: row.get(4)?,
fail_reason: row.get(5)?,
llm_id: row.get(6)?,
})
})?;
@@ -674,6 +743,7 @@ impl ParsedLLMStorageResult {
quantity: row.get(3)?,
gigabytes: row.get(4)?,
fail_reason: row.get(5)?,
llm_id: row.get(6)?,
})
})
.ok()
@@ -689,9 +759,10 @@ impl ParsedLLMStorageResult {
title,
quantity,
gigabytes,
fail_reason
fail_reason,
llm_id
)
VALUES (?1, ?2, ?3, ?4, ?5)",
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
Self::TABLE_NAME
),
(
@@ -700,6 +771,7 @@ impl ParsedLLMStorageResult {
self.quantity,
self.gigabytes,
self.fail_reason.clone(),
self.llm_id,
),
)
.unwrap();
@@ -733,7 +805,7 @@ pub struct Stats {
rows_parsed_storage: i64,
rows_parsed_page: i64,
rows_item_appearances: i64,
// pub rows_parsed_storage_llm: i64,
pub rows_parsed_storage_llm: i64,
}
pub fn get_stats(conn: &Connection) -> Stats {
@@ -743,7 +815,7 @@ pub fn get_stats(conn: &Connection) -> Stats {
rows_parsed_storage: ParsedStorage::get_count(conn),
rows_parsed_page: ParsedPage::get_count(conn),
rows_item_appearances: ItemAppearances::get_count(conn),
// rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
}
}
@@ -765,7 +837,7 @@ mod tests {
let listing = Listing {
id: 1,
item_id: 1234,
title: "Some Title".to_string(),
title: "Lot of 2 512GB SSD 6gb/s working with 5% wear".to_string(),
buy_it_now_price_cents: Some(123),
has_best_offer: false,
image_url: "google.com".to_string(),
@@ -779,8 +851,8 @@ mod tests {
total_gigabytes: 13,
quantity: 3,
individual_size_gigabytes: 13,
parse_engine: 9,
needed_description_check: true,
parse_engine: StorageParsingEngineVersion::Testing,
failed_reason: "".to_owned(),
};
parsed.add_or_update(&db);
assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
@@ -811,6 +883,7 @@ mod tests {
item_id: 12345,
quantity: 32,
title: "Some Title".to_owned(),
llm_id: StorageLLMVersion::Testing,
};
parsedllmstorage.add_or_update(&db);
assert_eq!(

View File

@@ -1,12 +1,11 @@
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
use chrono::{DateTime, Utc};
use clap::Parser;
use ebay_scraper_rust::db::{
DBTable, ItemAppearances, Listing, ParsedLLMStorageResult, ParsedPage, ParsedStorage,
SearchURL, get_initialized, get_stats, listings_get_filtered,
};
use ebay_scraper_rust::db;
use ebay_scraper_rust::db::DBTable;
use ebay_scraper_rust::parser::parse_dir;
use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
use futures::future::join_all;
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Mutex;
@@ -32,6 +31,7 @@ struct AppCtx {
db: rusqlite::Connection,
db_llm: rusqlite::Connection,
download_dir: PathBuf,
llm_parser: Option<actix_web::rt::task::JoinHandle<()>>,
}
#[derive(Deserialize, Debug)]
@@ -47,7 +47,7 @@ async fn listings_filtered_get(
filter: web::Query<ListingsFilter>,
) -> Result<impl Responder> {
let start = Instant::now();
let res = listings_get_filtered(
let res = db::listings_get_filtered(
&ctx.lock().unwrap().db,
&DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
filter.limit.unwrap_or(1_000),
@@ -64,12 +64,12 @@ async fn listings_filtered_get(
#[get("/listing/{id}")]
async fn listing_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
Ok(web::Json(Listing::lookup(&ctx.lock().unwrap().db, *id)))
Ok(web::Json(db::Listing::lookup(&ctx.lock().unwrap().db, *id)))
}
#[get("/listing/{id}/parsed")]
async fn listing_parse_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
Ok(web::Json(ParsedStorage::lookup(
Ok(web::Json(db::ParsedStorage::lookup(
&ctx.lock().unwrap().db,
*id,
)))
@@ -86,7 +86,7 @@ async fn listing_history_get(
ctx: Data<Mutex<AppCtx>>,
id: web::Path<i64>,
) -> Result<impl Responder> {
let history: Vec<_> = ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
let history: Vec<_> = db::ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
.iter()
// .inspect(|e| info!("got: {:?}", e))
.filter_map(|e| {
@@ -99,29 +99,85 @@ async fn listing_history_get(
Ok(web::Json(history))
}
/// Runs every parsing engine over `entries` (pairs of `(item_id, title)`)
/// and returns one `ParsedStorage` row per engine/entry combination.
///
/// The LLM requests are issued concurrently via `join_all`; entries the
/// LLM failed to parse (it returned `None`) are dropped from the LLM
/// results. The regex engine is infallible and always produces a row,
/// so every entry yields at least one result.
async fn storage_parse_work(entries: &[(i64, String)]) -> Vec<db::ParsedStorage> {
    // Kick off all LLM requests at once and wait for the whole batch.
    let llm_futures: Vec<_> = entries
        .iter()
        .map(|(id, title)| parser_storage_e1::parse_size_and_quantity_llm(*id, title))
        .collect();
    let llm_future_results = join_all(llm_futures).await;
    let llm_results = llm_future_results
        .iter()
        .flatten()
        .map(|e| db::ParsedStorage {
            id: 0,
            item: e.item_id,
            total_gigabytes: e.quantity * e.gigabytes,
            quantity: e.quantity,
            individual_size_gigabytes: e.gigabytes,
            failed_reason: e.fail_reason.clone(),
            parse_engine: db::StorageParsingEngineVersion::LLM,
        });

    // And a regex based parse of the same titles.
    let regex_results = entries
        .iter()
        .map(|(id, title)| parser_storage_e0::parse_size_and_quantity(*id, title));

    regex_results.chain(llm_results).collect()
}
/// Spawns a background task that, once per second, pulls a small batch of
/// listings that still need parsing, runs the parsing engines over them,
/// and writes the results back to the database.
///
/// The returned handle is stored in `AppCtx` so the worker is only
/// spawned once; it is never awaited, so the loop runs for the lifetime
/// of the server.
fn storage_parse_worker(ctx: Data<Mutex<AppCtx>>) -> actix_web::rt::task::JoinHandle<()> {
    actix_web::rt::spawn(async move {
        loop {
            actix_web::rt::time::sleep(std::time::Duration::from_millis(1000)).await;

            // Grab the pending batch while holding the lock, then release
            // it before awaiting. Holding a std::sync::Mutex guard across
            // the (network-bound, possibly multi-second) LLM calls would
            // block every other handler that needs the context.
            let entries = {
                let ctx_unlocked = ctx.lock().unwrap();
                db::Listing::lookup_pending_parse(&ctx_unlocked.db, &[], 10)
            };

            let parsed = storage_parse_work(entries.as_slice()).await;

            // Re-acquire the lock only for the quick, synchronous writes.
            let ctx_unlocked = ctx.lock().unwrap();
            for p in parsed {
                p.add_or_update(&ctx_unlocked.db);
            }
        }
    })
}
#[post("/listing/parse")]
async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
// Prepare a background parser to go through and use an LLM to parse the
// storage info.
if ctx.lock().unwrap().llm_parser.is_none() {
ctx.clone().lock().unwrap().llm_parser = Some(storage_parse_worker(ctx.clone()));
}
// Lets grab a few entries and then try parsing them with two engines.
let ctx_locked = ctx.lock().unwrap();
let entries: Vec<_> = Listing::lookup_non_parsed(&ctx_locked.db)
let entries: Vec<_> = db::Listing::lookup_pending_parse(&ctx_locked.db, &[], 100)
.iter()
.take(10)
.map(|e| e.clone())
.collect();
for (item_id, title) in &entries {
let ps0 = parser_storage_e0::parse_size_and_quantity(*item_id, &title);
ps0.add_or_update(&ctx_locked.db);
let ps1 =
parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await;
if ps1.is_some() {
info!(
"Parsed using an LLM title:{} and results:{:?}",
title,
ps1.unwrap()
);
// info!(
// "Parsed using an LLM title:{} and results:{:?}",
// title,
// ps1.unwrap()
// );
ps1.unwrap().add_or_update(&ctx_locked.db);
ps1.unwrap().add_or_update(&ctx_locked.db_llm);
// ps1.unwrap().add_or_update(&ctx_locked.db_llm); No need
} else {
error!("Failed to parse {item_id} with title {title}");
}
@@ -131,7 +187,7 @@ async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
#[get("/category")]
async fn category_getnames(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
Ok(web::Json(SearchURL::names(&ctx.lock().unwrap().db)))
Ok(web::Json(db::SearchURL::names(&ctx.lock().unwrap().db)))
}
#[post("/category/{category}/parse")]
@@ -156,9 +212,9 @@ async fn category_parse(
#[get("/stats")]
async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
let stats_db = get_stats(&ctx.lock().unwrap().db);
// let stats_db_llm = get_stats(&ctx.lock().unwrap().db_llm);
// stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
let mut stats_db = db::get_stats(&ctx.lock().unwrap().db);
let stats_db_llm = db::get_stats(&ctx.lock().unwrap().db_llm);
stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
Ok(web::Json(stats_db))
}
@@ -166,11 +222,11 @@ async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
async fn admin_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
let ctx_locked = ctx.lock().unwrap();
let query_start_time = Instant::now();
let search_urls = SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
let parsed_pages = ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
let parsed_storages = ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
let item_appearances = ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
let listings = Listing::get_all(&ctx_locked.db).unwrap_or_default();
let search_urls = db::SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
let parsed_pages = db::ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
let parsed_storages = db::ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
let item_appearances = db::ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
let listings = db::Listing::get_all(&ctx_locked.db).unwrap_or_default();
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
let html_gen_start_time = Instant::now();
@@ -313,17 +369,19 @@ async fn main() -> std::io::Result<()> {
let app_data = Data::new(Mutex::new(AppCtx {
download_dir: scrapedatadir.clone(),
db: get_initialized(None),
db: db::get_initialized(None),
db_llm: {
let db_path = scrapedatadir.with_file_name("llm.sqlite");
let db = rusqlite::Connection::open(db_path).unwrap();
ParsedLLMStorageResult::initialize(&db);
let db = rusqlite::Connection::open(&db_path).unwrap();
db::ParsedLLMStorageResult::initialize(&db);
info!("Created {:?} for caching LLM parsed title.", db_path);
db
},
llm_parser: None,
}));
// Prepare our backend via pulling in what catagories we are preconfigured with.
SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
db::SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
HttpServer::new(move || {
App::new()

View File

@@ -32,7 +32,7 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
let upper_title = title.to_uppercase();
let mut total_gb = 0i64;
let mut quantity = 1i64;
let mut needed_description_check = false;
let mut failed_reason = String::new();
let mut individual_size_gb = 0i64;
for pattern in EXPLICIT_QTY_PATTERNS.iter() {
@@ -68,36 +68,35 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
if !unique_sizes_gb.is_empty() {
individual_size_gb = unique_sizes_gb[0];
if unique_sizes_gb.len() > 1 {
needed_description_check = true;
failed_reason = "Mixed Sizes".to_owned();
}
}
}
if SIZE_RANGE_REGEX.is_match(&upper_title) {
needed_description_check = true;
failed_reason = "No Size Given".to_owned();
}
if quantity > 1 && upper_title.contains("MIXED") {
needed_description_check = true;
failed_reason = "Mixed Sizes".to_owned();
}
if upper_title.contains("CHECK THE DESCRIPTION")
|| upper_title.contains("CHECK DESCRIPTION")
|| upper_title.contains("SEE DESCRIPTION")
{
if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
needed_description_check = true;
failed_reason = "Mixed Sizes".to_owned();
}
}
if upper_title.contains("READ") {
failed_reason = "Mixed Sizes".to_owned();
}
if individual_size_gb > 0 {
total_gb = individual_size_gb * quantity;
}
if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
needed_description_check = true;
}
if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
// This condition is implicitly handled
failed_reason = "No size given".to_owned();
}
ParsedStorage {
@@ -106,8 +105,8 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
total_gigabytes: total_gb,
quantity,
individual_size_gigabytes: individual_size_gb,
needed_description_check,
parse_engine: 0,
failed_reason: failed_reason,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
}
}
@@ -125,8 +124,8 @@ mod tests {
total_gigabytes: 512 * 3,
quantity: 3,
individual_size_gigabytes: 512,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -138,8 +137,8 @@ mod tests {
total_gigabytes: 240,
quantity: 1,
individual_size_gigabytes: 240,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -151,8 +150,8 @@ mod tests {
total_gigabytes: 1024,
quantity: 1,
individual_size_gigabytes: 1024,
parse_engine: 0,
needed_description_check: true,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
false, // Sadly this one fails :/
),
@@ -164,8 +163,8 @@ mod tests {
total_gigabytes: 7 * 1024,
quantity: 1,
individual_size_gigabytes: 7 * 1024,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -177,8 +176,8 @@ mod tests {
total_gigabytes: 6 * 256,
quantity: 6,
individual_size_gigabytes: 256,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),
@@ -190,8 +189,8 @@ mod tests {
total_gigabytes: 1966,
quantity: 1,
individual_size_gigabytes: 1966,
parse_engine: 0,
needed_description_check: false,
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
failed_reason: String::new(),
},
true,
),

View File

@@ -1,10 +1,10 @@
use crate::db::ParsedLLMStorageResult;
use crate::db::ParsedStorage;
use actix_web::mime::APPLICATION_JSON;
use reqwest::header::AUTHORIZATION;
use reqwest::header::CONTENT_TYPE;
use crate::db::{
ParsedLLMStorageResult, ParsedStorage, StorageLLMVersion, StorageParsingEngineVersion,
};
use reqwest::header::{AUTHORIZATION, CONTENT_TYPE};
use serde::{Deserialize, Serialize};
use serde_json::json;
use tracing::error;
// Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD
// (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching, then
@@ -28,22 +28,6 @@ And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
}
"#;
fn create_request(title: &str) -> serde_json::Value {
json!({
"model": "gemini-2.5-flash-lite",
"messages": [
{
"role": "system",
"content": SYSTEM_PROMPT
},
{
"role": "user",
"content": title
}
]
})
}
#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)]
struct LLMParsedResponse {
pub quantity: i64,
@@ -66,40 +50,111 @@ struct OpenAIMessage {
content: String,
}
/// Parses size and quantity information from an item title.
pub async fn parse_size_and_quantity(
db: &rusqlite::Connection,
#[cfg(test)]
const OPENAI_LLM_URL: &str = "https://badurl.hak8or.com/litellm_api/chat/completions";
#[cfg(not(test))]
const OPENAI_LLM_URL: &str = "https://ai.hak8or.com/litellm_api/chat/completions";
#[cfg(test)]
const OPENAI_LLM_API_KEY: &str = "Bearer sk-YmVlcC1ib29wLWEtcm9ib3Q";
#[cfg(not(test))]
const OPENAI_LLM_API_KEY: &str = "Bearer sk-HMGML94x2ag6ggOoDghSGA";
pub async fn parse_size_and_quantity_llm(
item_id: i64,
title: &str,
) -> Option<ParsedStorage> {
) -> Option<ParsedLLMStorageResult> {
let client = reqwest::Client::new();
let req = client
.post("https://ai.hak8or.com/litellm_api/chat/completions")
.header(CONTENT_TYPE, APPLICATION_JSON.to_string())
.header(AUTHORIZATION, "Bearer sk-HMGML94x2ag6ggOoDghSGA")
.body(create_request(title).to_string());
.post(OPENAI_LLM_URL)
.header(CONTENT_TYPE, actix_web::mime::APPLICATION_JSON.to_string())
.header(AUTHORIZATION, OPENAI_LLM_API_KEY)
.body(
json!({
"model": "gemini-2.5-flash-lite",
"reasoning_effort": "disable",
"thinking": {"type": "disabled", "budget_tokens": 0},
"messages": [
{ "role": "system", "content": SYSTEM_PROMPT },
{ "role": "user", "content": title }
]
})
.to_string(),
);
let reply_body = req.send().await.ok()?.text().await.ok()?;
let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?;
match repl_json.choices.len() {
0 => {
error!("When parsing title, LLM returned ZERO choices");
return None;
}
1 => { /* Nothing to do */ }
a => error!("When parsing title, LLM returned {a}, >1 choices, using first!"),
}
let reply_parsed_storage_json: LLMParsedResponse =
serde_json::from_str(&repl_json.choices[0].message.content).ok()?;
let plsr = ParsedLLMStorageResult {
if !reply_parsed_storage_json.fail_reason.is_empty() {
error!(
"Failed parsing item_id:{item_id}, title:{title}, due to reason:{}",
reply_parsed_storage_json.fail_reason
);
}
Some(ParsedLLMStorageResult {
id: 0,
fail_reason: reply_parsed_storage_json.fail_reason.clone(),
gigabytes: reply_parsed_storage_json.gigabytes,
item_id,
quantity: reply_parsed_storage_json.quantity,
title: title.to_owned(),
};
llm_id: StorageLLMVersion::Gemini2d5Prompt0,
})
}
// Since we can't have a hashmap in a const, and I don't want to play with
// making our parsed result struct contain a CoW string for fail_reason and
// title, we are stuck with this ...
/// Returns a canned LLM parse result for the handful of titles used by
/// the unit tests, or `None` for any other title. This lets tests cover
/// both the success and the "mixed sizes" failure paths without making a
/// network call.
pub fn parse_cached(item_id: i64, title: &str) -> Option<ParsedLLMStorageResult> {
    match title {
        "Lot of 2 512GB SSD 6gb/s working with 5% wear" => Some(ParsedLLMStorageResult {
            id: 0,
            item_id,
            fail_reason: String::new(),
            gigabytes: 512,
            quantity: 2,
            title: title.to_owned(),
            llm_id: StorageLLMVersion::Testing,
        }),
        "Lot of 2 assorted SSD" => Some(ParsedLLMStorageResult {
            id: 0,
            fail_reason: "mixed sizes".to_owned(),
            gigabytes: 0,
            item_id,
            quantity: 0,
            title: title.to_owned(),
            llm_id: StorageLLMVersion::Testing,
        }),
        _ => None,
    }
}
/// Parses size and quantity information from an item title.
///
/// Thin wrapper over [`parse_size_and_quantity_llm`] that caches the raw
/// LLM reply in `db` and converts it into the engine-agnostic
/// `ParsedStorage` shape. Returns `None` only when the LLM call itself
/// failed; a parse the LLM rejected still yields a row whose
/// `failed_reason` is non-empty.
pub async fn parse_size_and_quantity(
    db: &rusqlite::Connection,
    item_id: i64,
    title: &str,
) -> Option<ParsedStorage> {
    let plsr = parse_size_and_quantity_llm(item_id, title).await?;
    // Cache the raw LLM result so re-parsing the same title is free.
    plsr.add_or_update(db);
    Some(ParsedStorage {
        id: 0,
        item: item_id,
        total_gigabytes: plsr.quantity * plsr.gigabytes,
        quantity: plsr.quantity,
        individual_size_gigabytes: plsr.gigabytes,
        failed_reason: plsr.fail_reason,
        parse_engine: StorageParsingEngineVersion::LLM,
    })
}