Add LLM-based parsing
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m11s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m34s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 5m14s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 11m4s

This commit is contained in:
2025-09-06 19:24:24 -04:00
parent b538dd8012
commit 4ae1622f02
8 changed files with 1004 additions and 50 deletions

208
src/parser_storage_e0.rs Normal file
View File

@@ -0,0 +1,208 @@
use crate::db::ParsedStorage;
use lazy_static::lazy_static;
use regex::*;
// let parsed_size_info = crate::parser_storage::parse_size_and_quantity(&cleaned_title);
// let _cost_per_tb = if let Some(price) = primary_display_price {
// if parsed_size_info.total_tb > 0.0 {
// Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
// } else {
// None
// }
// } else {
// None
// };
lazy_static! {
    /// Patterns that capture an explicit item count (capture group 1) from an
    /// upper-cased title, e.g. `LOT OF 3`, `PACK (4)`, `2-PACK`, `10 COUNT`.
    ///
    /// Order matters to callers that stop at the first usable match.
    static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = [
        r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?",
        r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)",
        r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?",
        r"\b(\d+)\s*-\s*PACK\b",
        r"\b(\d+)\s*COUNT\b",
    ]
    .iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect();

    /// A decimal number (group 1) followed by a `TB` or `GB` unit (group 2).
    static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();

    /// Two sizes joined by `-`, `&`, `OR` or `TO`, e.g. `256GB - 1TB`.
    static ref SIZE_RANGE_REGEX: Regex = Regex::new(
        r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)"
    )
    .unwrap();
}
/// Parses size and quantity information from an item title.
///
/// The title is upper-cased and then scanned for:
///
/// * an explicit quantity (`EXPLICIT_QTY_PATTERNS`); the first pattern, in
///   declaration order, that yields a value in `1..500` wins, otherwise the
///   quantity defaults to `1`;
/// * every `<number> TB|GB` occurrence (`SIZE_REGEX`), converted to whole
///   gigabytes with 1 TB = 1024 GB. Matches that are directly followed by
///   `/S` are transfer rates such as `6Gb/s`, not capacities, and are skipped.
///
/// The smallest distinct size found is reported as the per-item size, and the
/// total is that size multiplied by the quantity (saturating instead of
/// overflowing on absurdly large numbers).
///
/// `needed_description_check` is set when the title alone is ambiguous:
/// several distinct sizes, a size range, a "MIXED" lot, a "see description"
/// hint combined with a lot or with anything other than exactly one size, or
/// a lot whose computed total is zero even though a size was found.
///
/// `id` and `parse_engine` of the returned value are always `0`; `item` is
/// `item_id`.
///
/// NOTE(review): a quantity keyword directly followed by a size (e.g.
/// `LOT 256GB`) is still read as a quantity of 256 — confirm whether that is
/// intended.
pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
    let upper_title = title.to_uppercase();

    // Each pattern contributes at most its first match; implausible counts
    // (zero, or 500 and above) are ignored so a later pattern can still win.
    let quantity = EXPLICIT_QTY_PATTERNS
        .iter()
        .filter_map(|pattern| pattern.captures(&upper_title))
        .filter_map(|caps| caps.get(1))
        .filter_map(|qty| qty.as_str().parse::<i64>().ok())
        .find(|qty| (1..500).contains(qty))
        .unwrap_or(1);

    let mut sizes_gb: Vec<i64> = SIZE_REGEX
        .captures_iter(&upper_title)
        .filter_map(|caps| {
            // "6GB/S" is a link speed, not a capacity.
            let whole = caps.get(0)?;
            if upper_title[whole.end()..].starts_with("/S") {
                return None;
            }
            let value = caps.get(1)?.as_str().parse::<f64>().ok()?;
            let gigabytes = if caps.get(2)?.as_str() == "TB" {
                value * 1024.0
            } else {
                value
            };
            // Float-to-int `as` truncates toward zero and saturates at the
            // integer bounds, so fractional gigabytes are dropped.
            Some(gigabytes as i64)
        })
        .collect();

    // Number of size mentions before duplicates are collapsed.
    let size_match_count = sizes_gb.len();
    sizes_gb.sort_unstable();
    sizes_gb.dedup();

    // The smallest distinct size is taken as the per-item size.
    let individual_size_gb = sizes_gb.first().copied().unwrap_or(0);
    let total_gb = if individual_size_gb > 0 {
        individual_size_gb.saturating_mul(quantity)
    } else {
        0
    };

    let is_lot = quantity > 1;
    let mentions_description = upper_title.contains("CHECK THE DESCRIPTION")
        || upper_title.contains("CHECK DESCRIPTION")
        || upper_title.contains("SEE DESCRIPTION");

    let needed_description_check = sizes_gb.len() > 1
        || SIZE_RANGE_REGEX.is_match(&upper_title)
        || (is_lot && upper_title.contains("MIXED"))
        || (mentions_description && (is_lot || size_match_count != 1))
        || (is_lot && total_gb == 0 && size_match_count > 0);

    ParsedStorage {
        id: 0,
        item: item_id,
        total_gigabytes: total_gb,
        quantity,
        individual_size_gigabytes: individual_size_gb,
        needed_description_check,
        parse_engine: 0,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use similar_asserts::assert_eq;

    /// Title fixtures: `(title, expected parse result, enabled)`.
    ///
    /// Only cases whose last element is `true` are asserted by `parse`; the
    /// rest document known mismatches. The expected `item` doubles as the
    /// `item_id` passed to the parser.
    static TESTS: &[(&str, ParsedStorage, bool)] = &[
        (
            "Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD",
            ParsedStorage {
                id: 0,
                item: 0,
                total_gigabytes: 512 * 3,
                quantity: 3,
                individual_size_gigabytes: 512,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Samsung SM883 2.5” 240GB SATA 6Gbps MZ7KH240HAHQ-00005",
            ParsedStorage {
                id: 0,
                item: 1,
                total_gigabytes: 240,
                quantity: 1,
                individual_size_gigabytes: 240,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "1TB AData SU650 2.5-inch SATA 6Gb/s SSD Solid State Disk (READ)",
            ParsedStorage {
                id: 0,
                item: 2,
                total_gigabytes: 1024,
                quantity: 1,
                individual_size_gigabytes: 1024,
                parse_engine: 0,
                needed_description_check: true,
            },
            false, // Sadly this one fails :/
        ),
        (
            "Hitachi VSP 7TB Flash Module Drive (FMD) 3286734-A DKC-F810I-7R0FP",
            ParsedStorage {
                id: 0,
                item: 4,
                total_gigabytes: 7 * 1024,
                quantity: 1,
                individual_size_gigabytes: 7 * 1024,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)",
            ParsedStorage {
                id: 0,
                item: 5,
                total_gigabytes: 6 * 256,
                quantity: 6,
                individual_size_gigabytes: 256,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
        (
            "Lenovo-Micron 5300 Pro 1.92TB 2.5\" Sata SSD 02JG540 (MTFDDAK1T9TDS)",
            ParsedStorage {
                id: 0,
                item: 6,
                total_gigabytes: 1966,
                quantity: 1,
                individual_size_gigabytes: 1966,
                parse_engine: 0,
                needed_description_check: false,
            },
            true,
        ),
    ];

    /// Runs every enabled fixture through the parser and compares the whole
    /// result struct against the expectation.
    #[test_log::test]
    fn parse() {
        for (title, expected, enabled) in TESTS {
            if !*enabled {
                continue;
            }
            assert_eq!(*expected, parse_size_and_quantity(expected.item, title));
        }
    }
}