Moaaarrr
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m26s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m42s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 5m15s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m40s
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 4m26s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m42s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 5m15s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 11m40s
This commit is contained in:
98
Cargo.lock
generated
98
Cargo.lock
generated
@@ -695,7 +695,9 @@ dependencies = [
|
||||
"chrono",
|
||||
"clap",
|
||||
"dirs",
|
||||
"futures",
|
||||
"lazy_static",
|
||||
"num_enum",
|
||||
"rayon",
|
||||
"regex",
|
||||
"reqwest",
|
||||
@@ -848,6 +850,21 @@ dependencies = [
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-executor",
|
||||
"futures-io",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-channel"
|
||||
version = "0.3.31"
|
||||
@@ -864,12 +881,34 @@ version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e"
|
||||
|
||||
[[package]]
|
||||
name = "futures-executor"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f"
|
||||
dependencies = [
|
||||
"futures-core",
|
||||
"futures-task",
|
||||
"futures-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-io"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
|
||||
|
||||
[[package]]
|
||||
name = "futures-macro"
|
||||
version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futures-sink"
|
||||
version = "0.3.31"
|
||||
@@ -888,8 +927,10 @@ version = "0.3.31"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81"
|
||||
dependencies = [
|
||||
"futures-channel",
|
||||
"futures-core",
|
||||
"futures-io",
|
||||
"futures-macro",
|
||||
"futures-sink",
|
||||
"futures-task",
|
||||
"memchr",
|
||||
@@ -1566,6 +1607,28 @@ dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a"
|
||||
dependencies = [
|
||||
"num_enum_derive",
|
||||
"rustversion",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num_enum_derive"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d"
|
||||
dependencies = [
|
||||
"proc-macro-crate",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.36.7"
|
||||
@@ -1772,6 +1835,15 @@ version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-crate"
|
||||
version = "3.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "edce586971a4dfaa28950c6f18ed55e0406c1ab88bbce2c6f6293a7aaba73d35"
|
||||
dependencies = [
|
||||
"toml_edit",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.95"
|
||||
@@ -2534,6 +2606,23 @@ dependencies = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "toml_datetime"
|
||||
version = "0.6.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c"
|
||||
|
||||
[[package]]
|
||||
name = "toml_edit"
|
||||
version = "0.22.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a"
|
||||
dependencies = [
|
||||
"indexmap",
|
||||
"toml_datetime",
|
||||
"winnow",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tower"
|
||||
version = "0.5.2"
|
||||
@@ -3009,6 +3098,15 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "winnow"
|
||||
version = "0.7.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21a0236b59786fed61e2a80582dd500fe61f18b5dca67a4a067d0bc9039339cf"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
|
@@ -8,7 +8,9 @@ actix-web = "4.11.0"
|
||||
chrono = { version = "0.4.41", features = ["serde"] }
|
||||
clap = { version = "4.5.40", features = ["derive"] }
|
||||
dirs = "6.0.0"
|
||||
futures = "0.3.31"
|
||||
lazy_static = "1.5.0"
|
||||
num_enum = "0.7.4"
|
||||
rayon = "1.10.0"
|
||||
regex = "1.11.1"
|
||||
reqwest = { version = "0.12.23", features = ["blocking"] }
|
||||
|
153
src/db.rs
153
src/db.rs
@@ -1,5 +1,8 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use num_enum::TryFromPrimitive;
|
||||
use rusqlite::Connection;
|
||||
use rusqlite::ToSql;
|
||||
use rusqlite::types::FromSql;
|
||||
use serde::Deserialize;
|
||||
use serde::Serialize;
|
||||
use std::path::Path;
|
||||
@@ -205,15 +208,34 @@ impl ParsedPage {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, PartialEq, Copy, Clone)]
|
||||
#[repr(i64)]
|
||||
#[derive(Serialize, Debug, PartialEq, Copy, Clone, PartialOrd, Ord, Eq, TryFromPrimitive)]
|
||||
pub enum StorageParsingEngineVersion {
|
||||
Testing = 0,
|
||||
Regex = 1,
|
||||
LLM = 2,
|
||||
}
|
||||
impl ToSql for StorageParsingEngineVersion {
|
||||
fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
|
||||
Ok((*self as i64).into())
|
||||
}
|
||||
}
|
||||
impl FromSql for StorageParsingEngineVersion {
|
||||
fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
|
||||
let v = value.as_i64()?;
|
||||
Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Debug, PartialEq, Clone)]
|
||||
pub struct ParsedStorage {
|
||||
pub id: i64,
|
||||
pub item: i64,
|
||||
pub total_gigabytes: i64,
|
||||
pub quantity: i64,
|
||||
pub individual_size_gigabytes: i64,
|
||||
pub parse_engine: i64,
|
||||
pub needed_description_check: bool,
|
||||
pub parse_engine: StorageParsingEngineVersion,
|
||||
pub failed_reason: String,
|
||||
}
|
||||
impl DBTable for ParsedStorage {
|
||||
const TABLE_NAME: &'static str = "Storage_Parsed";
|
||||
@@ -224,13 +246,13 @@ impl DBTable for ParsedStorage {
|
||||
quantity INTEGER,
|
||||
sizes_gigabytes TEXT,
|
||||
parse_engine INTEGER,
|
||||
need_description_check INTEGER,
|
||||
failed_reason TEXT,
|
||||
UNIQUE(item, parse_engine)
|
||||
FOREIGN KEY(item) REFERENCES Listings(item_id)
|
||||
";
|
||||
|
||||
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
|
||||
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check FROM {}", Self::TABLE_NAME))?;
|
||||
let mut stmt = conn.prepare(&format!("SELECT id, item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason FROM {}", Self::TABLE_NAME))?;
|
||||
let iter = stmt.query_map([], |row| {
|
||||
Ok(ParsedStorage {
|
||||
id: row.get(0)?,
|
||||
@@ -242,7 +264,7 @@ impl DBTable for ParsedStorage {
|
||||
r.parse().unwrap_or(0)
|
||||
},
|
||||
parse_engine: row.get(5)?,
|
||||
needed_description_check: row.get(6)?,
|
||||
failed_reason: row.get(6)?,
|
||||
})
|
||||
})?;
|
||||
|
||||
@@ -273,7 +295,7 @@ impl ParsedStorage {
|
||||
r.parse().unwrap()
|
||||
},
|
||||
parse_engine: row.get(5)?,
|
||||
needed_description_check: row.get(6)?,
|
||||
failed_reason: row.get(6)?,
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
@@ -283,21 +305,26 @@ impl ParsedStorage {
|
||||
}
|
||||
|
||||
pub fn add_or_update(&self, conn: &Connection) {
|
||||
let _ = conn.execute(&format!("
|
||||
let _ = conn
|
||||
.execute(
|
||||
&format!(
|
||||
"
|
||||
INSERT OR REPLACE INTO {}
|
||||
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, need_description_check)
|
||||
(item, total_gigabytes, quantity, sizes_gigabytes, parse_engine, failed_reason)
|
||||
VALUES
|
||||
(?1, ?2, ?3, ?4, ?5, ?6)",
|
||||
Self::TABLE_NAME),
|
||||
(
|
||||
&self.item,
|
||||
self.total_gigabytes,
|
||||
self.quantity,
|
||||
self.individual_size_gigabytes.to_string(),
|
||||
self.parse_engine,
|
||||
self.needed_description_check
|
||||
Self::TABLE_NAME
|
||||
),
|
||||
(
|
||||
&self.item,
|
||||
self.total_gigabytes,
|
||||
self.quantity,
|
||||
self.individual_size_gigabytes.to_string(),
|
||||
self.parse_engine,
|
||||
&self.failed_reason,
|
||||
),
|
||||
)
|
||||
).unwrap();
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -494,19 +521,40 @@ impl Listing {
|
||||
.collect()
|
||||
}
|
||||
|
||||
pub fn lookup_non_parsed(conn: &Connection) -> Vec<(i64, String)> {
|
||||
let mut stmt = conn
|
||||
.prepare(&format!(
|
||||
"
|
||||
SELECT ei.item_id, ei.title FROM {} AS ei
|
||||
LEFT JOIN {} AS sp ON ei.item_id = sp.item
|
||||
WHERE sp.item IS NULL",
|
||||
Self::TABLE_NAME,
|
||||
ParsedStorage::TABLE_NAME
|
||||
))
|
||||
pub fn lookup_pending_parse(
|
||||
conn: &Connection,
|
||||
allowed_engines: &[i64],
|
||||
count_limit: u64,
|
||||
) -> Vec<(i64, String)> {
|
||||
let engines_filter = if !allowed_engines.is_empty() {
|
||||
format!(
|
||||
"AND ({})",
|
||||
allowed_engines
|
||||
.iter()
|
||||
.map(|e| "ps.parse_engine = ".to_owned() + &e.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(" OR ")
|
||||
)
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
let query = format!(
|
||||
"
|
||||
SELECT listing.item_id, listing.title FROM {0} AS listing
|
||||
WHERE NOT EXISTS (
|
||||
SELECT 1 FROM {1} AS ps
|
||||
WHERE listing.item_id = ps.item {engines_filter}
|
||||
)
|
||||
LIMIT {count_limit}
|
||||
",
|
||||
Self::TABLE_NAME,
|
||||
ParsedStorage::TABLE_NAME
|
||||
);
|
||||
conn.prepare(&query)
|
||||
.ok()
|
||||
.unwrap();
|
||||
stmt.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
|
||||
.unwrap()
|
||||
.query_map([], |row| Ok((row.get(0)?, row.get(1)?)))
|
||||
.ok()
|
||||
.unwrap()
|
||||
.map(|e| e.unwrap())
|
||||
@@ -565,7 +613,7 @@ pub fn listings_get_filtered(
|
||||
history: ItemAppearances::lookup(conn, l.item_id),
|
||||
parsed: ParsedStorage::lookup(conn, l.item_id),
|
||||
})
|
||||
.filter(|lr| lr.parsed.iter().any(|p| !p.needed_description_check))
|
||||
.filter(|lr| lr.parsed.iter().any(|p| p.failed_reason.is_empty()))
|
||||
.collect::<Vec<ListingsFilterResult>>();
|
||||
info!(
|
||||
"Found total {} listings since (str:{} epoch:{})",
|
||||
@@ -614,6 +662,24 @@ pub fn listings_get_filtered(
|
||||
listings
|
||||
}
|
||||
|
||||
#[repr(i64)]
|
||||
#[derive(Serialize, Debug, PartialEq, Copy, Clone, TryFromPrimitive)]
|
||||
pub enum StorageLLMVersion {
|
||||
Testing = 0,
|
||||
Gemini2d5Prompt0 = 1,
|
||||
}
|
||||
impl ToSql for StorageLLMVersion {
|
||||
fn to_sql(&self) -> rusqlite::Result<rusqlite::types::ToSqlOutput<'_>> {
|
||||
Ok((*self as i64).into())
|
||||
}
|
||||
}
|
||||
impl FromSql for StorageLLMVersion {
|
||||
fn column_result(value: rusqlite::types::ValueRef<'_>) -> rusqlite::types::FromSqlResult<Self> {
|
||||
let v = value.as_i64()?;
|
||||
Self::try_from(v).map_err(|_| rusqlite::types::FromSqlError::OutOfRange(v))
|
||||
}
|
||||
}
|
||||
|
||||
// This is mostly meant as a way to cache all of these.
|
||||
#[derive(Serialize, Debug, PartialEq, Clone)]
|
||||
pub struct ParsedLLMStorageResult {
|
||||
@@ -623,6 +689,7 @@ pub struct ParsedLLMStorageResult {
|
||||
pub quantity: i64,
|
||||
pub gigabytes: i64,
|
||||
pub fail_reason: String,
|
||||
pub llm_id: StorageLLMVersion,
|
||||
}
|
||||
impl DBTable for ParsedLLMStorageResult {
|
||||
const TABLE_NAME: &'static str = "ParsedLLMStorageResult";
|
||||
@@ -632,12 +699,13 @@ impl DBTable for ParsedLLMStorageResult {
|
||||
title TEXT NOT NULL,
|
||||
quantity INTEGER NOT NULL,
|
||||
gigabytes INTEGER NOT NULL,
|
||||
fail_reason TEXT NOT NULL
|
||||
fail_reason TEXT NOT NULL,
|
||||
llm_id INTEGER NOT NULL
|
||||
";
|
||||
|
||||
fn get_all(conn: &Connection) -> rusqlite::Result<Vec<Self>> {
|
||||
let mut stmt = conn.prepare(&format!(
|
||||
"SELECT id, item_id, title, quantity, gigabytes, fail_reason FROM {}",
|
||||
"SELECT id, item_id, title, quantity, gigabytes, fail_reason, llm_id FROM {}",
|
||||
Self::TABLE_NAME
|
||||
))?;
|
||||
let iter = stmt.query_map([], |row| {
|
||||
@@ -648,6 +716,7 @@ impl DBTable for ParsedLLMStorageResult {
|
||||
quantity: row.get(3)?,
|
||||
gigabytes: row.get(4)?,
|
||||
fail_reason: row.get(5)?,
|
||||
llm_id: row.get(6)?,
|
||||
})
|
||||
})?;
|
||||
|
||||
@@ -674,6 +743,7 @@ impl ParsedLLMStorageResult {
|
||||
quantity: row.get(3)?,
|
||||
gigabytes: row.get(4)?,
|
||||
fail_reason: row.get(5)?,
|
||||
llm_id: row.get(6)?,
|
||||
})
|
||||
})
|
||||
.ok()
|
||||
@@ -689,9 +759,10 @@ impl ParsedLLMStorageResult {
|
||||
title,
|
||||
quantity,
|
||||
gigabytes,
|
||||
fail_reason
|
||||
fail_reason,
|
||||
llm_id
|
||||
)
|
||||
VALUES (?1, ?2, ?3, ?4, ?5)",
|
||||
VALUES (?1, ?2, ?3, ?4, ?5, ?6)",
|
||||
Self::TABLE_NAME
|
||||
),
|
||||
(
|
||||
@@ -700,6 +771,7 @@ impl ParsedLLMStorageResult {
|
||||
self.quantity,
|
||||
self.gigabytes,
|
||||
self.fail_reason.clone(),
|
||||
self.llm_id,
|
||||
),
|
||||
)
|
||||
.unwrap();
|
||||
@@ -733,7 +805,7 @@ pub struct Stats {
|
||||
rows_parsed_storage: i64,
|
||||
rows_parsed_page: i64,
|
||||
rows_item_appearances: i64,
|
||||
// pub rows_parsed_storage_llm: i64,
|
||||
pub rows_parsed_storage_llm: i64,
|
||||
}
|
||||
|
||||
pub fn get_stats(conn: &Connection) -> Stats {
|
||||
@@ -743,7 +815,7 @@ pub fn get_stats(conn: &Connection) -> Stats {
|
||||
rows_parsed_storage: ParsedStorage::get_count(conn),
|
||||
rows_parsed_page: ParsedPage::get_count(conn),
|
||||
rows_item_appearances: ItemAppearances::get_count(conn),
|
||||
// rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
|
||||
rows_parsed_storage_llm: ParsedLLMStorageResult::get_count(conn),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -765,7 +837,7 @@ mod tests {
|
||||
let listing = Listing {
|
||||
id: 1,
|
||||
item_id: 1234,
|
||||
title: "Some Title".to_string(),
|
||||
title: "Lot of 2 512GB SSD 6gb/s working with 5% wear".to_string(),
|
||||
buy_it_now_price_cents: Some(123),
|
||||
has_best_offer: false,
|
||||
image_url: "google.com".to_string(),
|
||||
@@ -779,8 +851,8 @@ mod tests {
|
||||
total_gigabytes: 13,
|
||||
quantity: 3,
|
||||
individual_size_gigabytes: 13,
|
||||
parse_engine: 9,
|
||||
needed_description_check: true,
|
||||
parse_engine: StorageParsingEngineVersion::Testing,
|
||||
failed_reason: "".to_owned(),
|
||||
};
|
||||
parsed.add_or_update(&db);
|
||||
assert_eq!(ParsedStorage::lookup(&db, listing.item_id), vec![parsed]);
|
||||
@@ -811,6 +883,7 @@ mod tests {
|
||||
item_id: 12345,
|
||||
quantity: 32,
|
||||
title: "Some Title".to_owned(),
|
||||
llm_id: StorageLLMVersion::Testing,
|
||||
};
|
||||
parsedllmstorage.add_or_update(&db);
|
||||
assert_eq!(
|
||||
|
120
src/main.rs
120
src/main.rs
@@ -1,12 +1,11 @@
|
||||
use actix_web::{App, HttpServer, Responder, Result, get, post, web, web::Data};
|
||||
use chrono::{DateTime, Utc};
|
||||
use clap::Parser;
|
||||
use ebay_scraper_rust::db::{
|
||||
DBTable, ItemAppearances, Listing, ParsedLLMStorageResult, ParsedPage, ParsedStorage,
|
||||
SearchURL, get_initialized, get_stats, listings_get_filtered,
|
||||
};
|
||||
use ebay_scraper_rust::db;
|
||||
use ebay_scraper_rust::db::DBTable;
|
||||
use ebay_scraper_rust::parser::parse_dir;
|
||||
use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
|
||||
use futures::future::join_all;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Mutex;
|
||||
@@ -32,6 +31,7 @@ struct AppCtx {
|
||||
db: rusqlite::Connection,
|
||||
db_llm: rusqlite::Connection,
|
||||
download_dir: PathBuf,
|
||||
llm_parser: Option<actix_web::rt::task::JoinHandle<()>>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
@@ -47,7 +47,7 @@ async fn listings_filtered_get(
|
||||
filter: web::Query<ListingsFilter>,
|
||||
) -> Result<impl Responder> {
|
||||
let start = Instant::now();
|
||||
let res = listings_get_filtered(
|
||||
let res = db::listings_get_filtered(
|
||||
&ctx.lock().unwrap().db,
|
||||
&DateTime::<Utc>::from_timestamp(filter.since.unwrap_or(0), 0).unwrap(),
|
||||
filter.limit.unwrap_or(1_000),
|
||||
@@ -64,12 +64,12 @@ async fn listings_filtered_get(
|
||||
|
||||
#[get("/listing/{id}")]
|
||||
async fn listing_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
|
||||
Ok(web::Json(Listing::lookup(&ctx.lock().unwrap().db, *id)))
|
||||
Ok(web::Json(db::Listing::lookup(&ctx.lock().unwrap().db, *id)))
|
||||
}
|
||||
|
||||
#[get("/listing/{id}/parsed")]
|
||||
async fn listing_parse_get(ctx: Data<Mutex<AppCtx>>, id: web::Path<i64>) -> Result<impl Responder> {
|
||||
Ok(web::Json(ParsedStorage::lookup(
|
||||
Ok(web::Json(db::ParsedStorage::lookup(
|
||||
&ctx.lock().unwrap().db,
|
||||
*id,
|
||||
)))
|
||||
@@ -86,7 +86,7 @@ async fn listing_history_get(
|
||||
ctx: Data<Mutex<AppCtx>>,
|
||||
id: web::Path<i64>,
|
||||
) -> Result<impl Responder> {
|
||||
let history: Vec<_> = ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
|
||||
let history: Vec<_> = db::ItemAppearances::lookup(&ctx.lock().unwrap().db, *id)
|
||||
.iter()
|
||||
// .inspect(|e| info!("got: {:?}", e))
|
||||
.filter_map(|e| {
|
||||
@@ -99,29 +99,85 @@ async fn listing_history_get(
|
||||
Ok(web::Json(history))
|
||||
}
|
||||
|
||||
async fn storage_parse_work(entries: &[(i64, String)]) -> Vec<db::ParsedStorage> {
|
||||
let llm_futures: Vec<_> = entries
|
||||
.iter()
|
||||
.map(|(id, title)| parser_storage_e1::parse_size_and_quantity_llm(*id, title))
|
||||
.collect();
|
||||
let llm_future_results = join_all(llm_futures).await;
|
||||
let llm_results = llm_future_results
|
||||
.iter()
|
||||
.flatten()
|
||||
.map(|e| db::ParsedStorage {
|
||||
id: 0,
|
||||
item: e.item_id,
|
||||
total_gigabytes: e.quantity * e.gigabytes,
|
||||
quantity: e.quantity,
|
||||
individual_size_gigabytes: e.gigabytes,
|
||||
failed_reason: e.fail_reason.clone(),
|
||||
parse_engine: db::StorageParsingEngineVersion::LLM,
|
||||
});
|
||||
// .inspect(|e| e.add_or_update(&unlocked.db))
|
||||
// .map(|e| db::ParsedStorage {
|
||||
// id: 0,
|
||||
// item: e.item_id,
|
||||
// total_gigabytes: e.quantity * e.gigabytes,
|
||||
// quantity: e.quantity,
|
||||
// individual_size_gigabytes: e.gigabytes,
|
||||
// needed_description_check: !e.fail_reason.is_empty(),
|
||||
// parse_engine: db::StorageParsingEngineVersion::LLM,
|
||||
// })
|
||||
// .for_each(|e| e.add_or_update(&unlocked.db));
|
||||
|
||||
// And a regex based parse.
|
||||
let regex_results = entries
|
||||
.iter()
|
||||
.map(|(id, title)| parser_storage_e0::parse_size_and_quantity(*id, &title));
|
||||
// .for_each(|e| e.add_or_update(&unlocked.db));
|
||||
|
||||
regex_results.chain(llm_results).collect()
|
||||
}
|
||||
|
||||
fn storage_parse_worker(ctx: Data<Mutex<AppCtx>>) -> actix_web::rt::task::JoinHandle<()> {
|
||||
actix_web::rt::spawn(async move {
|
||||
loop {
|
||||
actix_web::rt::time::sleep(std::time::Duration::from_millis(1000)).await;
|
||||
let ctx_unlocked = ctx.lock().unwrap();
|
||||
let entries = db::Listing::lookup_pending_parse(&ctx_unlocked.db, &[], 10);
|
||||
let parsed = storage_parse_work(entries.as_slice()).await;
|
||||
for p in parsed {
|
||||
p.add_or_update(&ctx_unlocked.db);
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
#[post("/listing/parse")]
|
||||
async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
// Prepare a background parser to go through and use an LLM to parse the
|
||||
// storage info.
|
||||
if ctx.lock().unwrap().llm_parser.is_none() {
|
||||
ctx.clone().lock().unwrap().llm_parser = Some(storage_parse_worker(ctx.clone()));
|
||||
}
|
||||
|
||||
// Let's grab a few entries and then try parsing them with two engines.
|
||||
let ctx_locked = ctx.lock().unwrap();
|
||||
let entries: Vec<_> = Listing::lookup_non_parsed(&ctx_locked.db)
|
||||
let entries: Vec<_> = db::Listing::lookup_pending_parse(&ctx_locked.db, &[], 100)
|
||||
.iter()
|
||||
.take(10)
|
||||
.map(|e| e.clone())
|
||||
.collect();
|
||||
for (item_id, title) in &entries {
|
||||
let ps0 = parser_storage_e0::parse_size_and_quantity(*item_id, &title);
|
||||
ps0.add_or_update(&ctx_locked.db);
|
||||
|
||||
let ps1 =
|
||||
parser_storage_e1::parse_size_and_quantity(&ctx_locked.db_llm, *item_id, &title).await;
|
||||
if ps1.is_some() {
|
||||
info!(
|
||||
"Parsed using an LLM title:{} and results:{:?}",
|
||||
title,
|
||||
ps1.unwrap()
|
||||
);
|
||||
// info!(
|
||||
// "Parsed using an LLM title:{} and results:{:?}",
|
||||
// title,
|
||||
// ps1.unwrap()
|
||||
// );
|
||||
ps1.unwrap().add_or_update(&ctx_locked.db);
|
||||
ps1.unwrap().add_or_update(&ctx_locked.db_llm);
|
||||
// ps1.unwrap().add_or_update(&ctx_locked.db_llm); No need
|
||||
} else {
|
||||
error!("Failed to parse {item_id} with title {title}");
|
||||
}
|
||||
@@ -131,7 +187,7 @@ async fn parse_listings(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
|
||||
#[get("/category")]
|
||||
async fn category_getnames(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
Ok(web::Json(SearchURL::names(&ctx.lock().unwrap().db)))
|
||||
Ok(web::Json(db::SearchURL::names(&ctx.lock().unwrap().db)))
|
||||
}
|
||||
|
||||
#[post("/category/{category}/parse")]
|
||||
@@ -156,9 +212,9 @@ async fn category_parse(
|
||||
|
||||
#[get("/stats")]
|
||||
async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
let stats_db = get_stats(&ctx.lock().unwrap().db);
|
||||
// let stats_db_llm = get_stats(&ctx.lock().unwrap().db_llm);
|
||||
// stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
|
||||
let mut stats_db = db::get_stats(&ctx.lock().unwrap().db);
|
||||
let stats_db_llm = db::get_stats(&ctx.lock().unwrap().db_llm);
|
||||
stats_db.rows_parsed_storage_llm = stats_db_llm.rows_parsed_storage_llm;
|
||||
Ok(web::Json(stats_db))
|
||||
}
|
||||
|
||||
@@ -166,11 +222,11 @@ async fn stats_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
async fn admin_get(ctx: Data<Mutex<AppCtx>>) -> Result<impl Responder> {
|
||||
let ctx_locked = ctx.lock().unwrap();
|
||||
let query_start_time = Instant::now();
|
||||
let search_urls = SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let parsed_pages = ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let parsed_storages = ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let item_appearances = ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let listings = Listing::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let search_urls = db::SearchURL::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let parsed_pages = db::ParsedPage::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let parsed_storages = db::ParsedStorage::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let item_appearances = db::ItemAppearances::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let listings = db::Listing::get_all(&ctx_locked.db).unwrap_or_default();
|
||||
let total_query_time = query_start_time.elapsed().as_micros() as f64 / 1000.0;
|
||||
|
||||
let html_gen_start_time = Instant::now();
|
||||
@@ -313,17 +369,19 @@ async fn main() -> std::io::Result<()> {
|
||||
|
||||
let app_data = Data::new(Mutex::new(AppCtx {
|
||||
download_dir: scrapedatadir.clone(),
|
||||
db: get_initialized(None),
|
||||
db: db::get_initialized(None),
|
||||
db_llm: {
|
||||
let db_path = scrapedatadir.with_file_name("llm.sqlite");
|
||||
let db = rusqlite::Connection::open(db_path).unwrap();
|
||||
ParsedLLMStorageResult::initialize(&db);
|
||||
let db = rusqlite::Connection::open(&db_path).unwrap();
|
||||
db::ParsedLLMStorageResult::initialize(&db);
|
||||
info!("Created {:?} for caching LLM parsed title.", db_path);
|
||||
db
|
||||
},
|
||||
llm_parser: None,
|
||||
}));
|
||||
|
||||
// Prepare our backend by pulling in the categories we are preconfigured with.
|
||||
SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
|
||||
db::SearchURL::scan(&app_data.lock().unwrap().db, &scrapedatadir, "url.json");
|
||||
|
||||
HttpServer::new(move || {
|
||||
App::new()
|
||||
|
@@ -32,7 +32,7 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
let upper_title = title.to_uppercase();
|
||||
let mut total_gb = 0i64;
|
||||
let mut quantity = 1i64;
|
||||
let mut needed_description_check = false;
|
||||
let mut failed_reason = String::new();
|
||||
let mut individual_size_gb = 0i64;
|
||||
|
||||
for pattern in EXPLICIT_QTY_PATTERNS.iter() {
|
||||
@@ -68,36 +68,35 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
if !unique_sizes_gb.is_empty() {
|
||||
individual_size_gb = unique_sizes_gb[0];
|
||||
if unique_sizes_gb.len() > 1 {
|
||||
needed_description_check = true;
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if SIZE_RANGE_REGEX.is_match(&upper_title) {
|
||||
needed_description_check = true;
|
||||
failed_reason = "No Size Given".to_owned();
|
||||
}
|
||||
if quantity > 1 && upper_title.contains("MIXED") {
|
||||
needed_description_check = true;
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
if upper_title.contains("CHECK THE DESCRIPTION")
|
||||
|| upper_title.contains("CHECK DESCRIPTION")
|
||||
|| upper_title.contains("SEE DESCRIPTION")
|
||||
{
|
||||
if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
|
||||
needed_description_check = true;
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
}
|
||||
if upper_title.contains("READ") {
|
||||
failed_reason = "Mixed Sizes".to_owned();
|
||||
}
|
||||
|
||||
if individual_size_gb > 0 {
|
||||
total_gb = individual_size_gb * quantity;
|
||||
}
|
||||
|
||||
if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
|
||||
needed_description_check = true;
|
||||
}
|
||||
|
||||
if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
|
||||
// This condition is implicitly handled
|
||||
failed_reason = "No size given".to_owned();
|
||||
}
|
||||
|
||||
ParsedStorage {
|
||||
@@ -106,8 +105,8 @@ pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
total_gigabytes: total_gb,
|
||||
quantity,
|
||||
individual_size_gigabytes: individual_size_gb,
|
||||
needed_description_check,
|
||||
parse_engine: 0,
|
||||
failed_reason: failed_reason,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -125,8 +124,8 @@ mod tests {
|
||||
total_gigabytes: 512 * 3,
|
||||
quantity: 3,
|
||||
individual_size_gigabytes: 512,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -138,8 +137,8 @@ mod tests {
|
||||
total_gigabytes: 240,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 240,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -151,8 +150,8 @@ mod tests {
|
||||
total_gigabytes: 1024,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 1024,
|
||||
parse_engine: 0,
|
||||
needed_description_check: true,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
false, // Sadly this one fails :/
|
||||
),
|
||||
@@ -164,8 +163,8 @@ mod tests {
|
||||
total_gigabytes: 7 * 1024,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 7 * 1024,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -177,8 +176,8 @@ mod tests {
|
||||
total_gigabytes: 6 * 256,
|
||||
quantity: 6,
|
||||
individual_size_gigabytes: 256,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
@@ -190,8 +189,8 @@ mod tests {
|
||||
total_gigabytes: 1966,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 1966,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
parse_engine: crate::db::StorageParsingEngineVersion::Regex,
|
||||
failed_reason: String::new(),
|
||||
},
|
||||
true,
|
||||
),
|
||||
|
@@ -1,10 +1,10 @@
|
||||
use crate::db::ParsedLLMStorageResult;
|
||||
use crate::db::ParsedStorage;
|
||||
use actix_web::mime::APPLICATION_JSON;
|
||||
use reqwest::header::AUTHORIZATION;
|
||||
use reqwest::header::CONTENT_TYPE;
|
||||
use crate::db::{
|
||||
ParsedLLMStorageResult, ParsedStorage, StorageLLMVersion, StorageParsingEngineVersion,
|
||||
};
|
||||
use reqwest::header::{AUTHORIZATION, CONTENT_TYPE};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use tracing::error;
|
||||
|
||||
// Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2NVMe Internal SSD
|
||||
// (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching, then
|
||||
@@ -28,22 +28,6 @@ And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
|
||||
}
|
||||
"#;
|
||||
|
||||
fn create_request(title: &str) -> serde_json::Value {
|
||||
json!({
|
||||
"model": "gemini-2.5-flash-lite",
|
||||
"messages": [
|
||||
{
|
||||
"role": "system",
|
||||
"content": SYSTEM_PROMPT
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": title
|
||||
}
|
||||
]
|
||||
})
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Serialize, Debug, PartialEq, Clone)]
|
||||
struct LLMParsedResponse {
|
||||
pub quantity: i64,
|
||||
@@ -66,40 +50,111 @@ struct OpenAIMessage {
|
||||
content: String,
|
||||
}
|
||||
|
||||
/// Parses size and quantity information from an item title.
|
||||
pub async fn parse_size_and_quantity(
|
||||
db: &rusqlite::Connection,
|
||||
#[cfg(test)]
|
||||
const OPENAI_LLM_URL: &str = "https://badurl.hak8or.com/litellm_api/chat/completions";
|
||||
#[cfg(not(test))]
|
||||
const OPENAI_LLM_URL: &str = "https://ai.hak8or.com/litellm_api/chat/completions";
|
||||
|
||||
#[cfg(test)]
|
||||
const OPENAI_LLM_API_KEY: &str = "Bearer sk-YmVlcC1ib29wLWEtcm9ib3Q";
|
||||
#[cfg(not(test))]
|
||||
const OPENAI_LLM_API_KEY: &str = "Bearer sk-HMGML94x2ag6ggOoDghSGA";
|
||||
|
||||
pub async fn parse_size_and_quantity_llm(
|
||||
item_id: i64,
|
||||
title: &str,
|
||||
) -> Option<ParsedStorage> {
|
||||
) -> Option<ParsedLLMStorageResult> {
|
||||
let client = reqwest::Client::new();
|
||||
let req = client
|
||||
.post("https://ai.hak8or.com/litellm_api/chat/completions")
|
||||
.header(CONTENT_TYPE, APPLICATION_JSON.to_string())
|
||||
.header(AUTHORIZATION, "Bearer sk-HMGML94x2ag6ggOoDghSGA")
|
||||
.body(create_request(title).to_string());
|
||||
.post(OPENAI_LLM_URL)
|
||||
.header(CONTENT_TYPE, actix_web::mime::APPLICATION_JSON.to_string())
|
||||
.header(AUTHORIZATION, OPENAI_LLM_API_KEY)
|
||||
.body(
|
||||
json!({
|
||||
"model": "gemini-2.5-flash-lite",
|
||||
"reasoning_effort": "disable",
|
||||
"thinking": {"type": "disabled", "budget_tokens": 0},
|
||||
"messages": [
|
||||
{ "role": "system", "content": SYSTEM_PROMPT },
|
||||
{ "role": "user", "content": title }
|
||||
]
|
||||
})
|
||||
.to_string(),
|
||||
);
|
||||
let reply_body = req.send().await.ok()?.text().await.ok()?;
|
||||
let repl_json: OpenAIResponse = serde_json::from_str(&reply_body).ok()?;
|
||||
match repl_json.choices.len() {
|
||||
0 => {
|
||||
error!("When parsing title, LLM returned ZERO choices");
|
||||
return None;
|
||||
}
|
||||
1 => { /* Nothing to do */ }
|
||||
a => error!("When parsing title, LLM returned {a}, >1 choices, using first!"),
|
||||
}
|
||||
let reply_parsed_storage_json: LLMParsedResponse =
|
||||
serde_json::from_str(&repl_json.choices[0].message.content).ok()?;
|
||||
|
||||
let plsr = ParsedLLMStorageResult {
|
||||
if !reply_parsed_storage_json.fail_reason.is_empty() {
|
||||
error!(
|
||||
"Failed parsing item_id:{item_id}, title:{title}, due to reason:{}",
|
||||
reply_parsed_storage_json.fail_reason
|
||||
);
|
||||
}
|
||||
|
||||
Some(ParsedLLMStorageResult {
|
||||
id: 0,
|
||||
fail_reason: reply_parsed_storage_json.fail_reason.clone(),
|
||||
gigabytes: reply_parsed_storage_json.gigabytes,
|
||||
item_id,
|
||||
quantity: reply_parsed_storage_json.quantity,
|
||||
title: title.to_owned(),
|
||||
};
|
||||
llm_id: StorageLLMVersion::Gemini2d5Prompt0,
|
||||
})
|
||||
}
|
||||
|
||||
// Since we can't have a hashmap in a const, and I don't want to play with
|
||||
// making our parsed result struct contain a CoW string for fail_reason and
|
||||
// title, we are stuck with this ...
|
||||
pub fn parse_cached(item_id: i64, title: &str) -> Option<ParsedLLMStorageResult> {
|
||||
match title {
|
||||
"Lot of 2 512GB SSD 6gb/s working with 5% wear" => Some(ParsedLLMStorageResult {
|
||||
id: 0,
|
||||
item_id: item_id,
|
||||
fail_reason: "".to_string(),
|
||||
gigabytes: 512,
|
||||
quantity: 2,
|
||||
title: title.to_owned(),
|
||||
llm_id: StorageLLMVersion::Testing,
|
||||
}),
|
||||
"Lot of 2 assorted SSD" => Some(ParsedLLMStorageResult {
|
||||
id: 0,
|
||||
fail_reason: "mixed sizes".to_owned(),
|
||||
gigabytes: 0,
|
||||
item_id,
|
||||
quantity: 0,
|
||||
title: title.to_owned(),
|
||||
llm_id: StorageLLMVersion::Testing,
|
||||
}),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parses size and quantity information from an item title.
|
||||
pub async fn parse_size_and_quantity(
|
||||
db: &rusqlite::Connection,
|
||||
item_id: i64,
|
||||
title: &str,
|
||||
) -> Option<ParsedStorage> {
|
||||
let plsr = parse_size_and_quantity_llm(item_id, title).await?;
|
||||
plsr.add_or_update(&db);
|
||||
|
||||
Some(ParsedStorage {
|
||||
id: 0,
|
||||
item: item_id,
|
||||
total_gigabytes: reply_parsed_storage_json.quantity * reply_parsed_storage_json.gigabytes,
|
||||
quantity: reply_parsed_storage_json.quantity,
|
||||
individual_size_gigabytes: reply_parsed_storage_json.gigabytes,
|
||||
needed_description_check: !reply_parsed_storage_json.fail_reason.is_empty(),
|
||||
parse_engine: 1,
|
||||
total_gigabytes: plsr.quantity * plsr.gigabytes,
|
||||
quantity: plsr.quantity,
|
||||
individual_size_gigabytes: plsr.gigabytes,
|
||||
failed_reason: plsr.fail_reason,
|
||||
parse_engine: StorageParsingEngineVersion::LLM,
|
||||
})
|
||||
}
|
||||
|
Reference in New Issue
Block a user