Add LLM based parsing
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m11s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m34s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 5m14s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 11m4s
All checks were successful
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m11s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m34s
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 5m14s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 11m4s
This commit is contained in:
208
src/parser_storage_e0.rs
Normal file
208
src/parser_storage_e0.rs
Normal file
@@ -0,0 +1,208 @@
|
||||
use crate::db::ParsedStorage;
|
||||
use lazy_static::lazy_static;
|
||||
use regex::*;
|
||||
|
||||
// let parsed_size_info = crate::parser_storage::parse_size_and_quantity(&cleaned_title);
|
||||
// let _cost_per_tb = if let Some(price) = primary_display_price {
|
||||
// if parsed_size_info.total_tb > 0.0 {
|
||||
// Some(((price / parsed_size_info.total_tb) * 100.0).round() / 100.0)
|
||||
// } else {
|
||||
// None
|
||||
// }
|
||||
// } else {
|
||||
// None
|
||||
// };
|
||||
|
||||
lazy_static! {
|
||||
static ref EXPLICIT_QTY_PATTERNS: Vec<Regex> = vec![
|
||||
Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?").unwrap(),
|
||||
Regex::new(r"\b(?:LOT\s+OF|LOT)\s*\*\s*(\d+)").unwrap(),
|
||||
Regex::new(r"\b(?:PACK\s+OF|PACK|BULK)\s*\(?\s*(\d+)\s*\)?").unwrap(),
|
||||
Regex::new(r"\b(\d+)\s*-\s*PACK\b").unwrap(),
|
||||
Regex::new(r"\b(\d+)\s*COUNT\b").unwrap(),
|
||||
];
|
||||
static ref SIZE_REGEX: Regex = Regex::new(r"(\d+(?:\.\d+)?)\s*(TB|GB)\b").unwrap();
|
||||
static ref SIZE_RANGE_REGEX: Regex =
|
||||
Regex::new(r"\d+(?:\.\d+)?\s*(?:GB|TB)\s*(?:-|&|OR|TO)\s*\d+(?:\.\d+)?\s*(?:GB|TB)")
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
/// Parses size and quantity information from an item title.
|
||||
pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||
let upper_title = title.to_uppercase();
|
||||
let mut total_gb = 0i64;
|
||||
let mut quantity = 1i64;
|
||||
let mut needed_description_check = false;
|
||||
let mut individual_size_gb = 0i64;
|
||||
|
||||
for pattern in EXPLICIT_QTY_PATTERNS.iter() {
|
||||
if let Some(caps) = pattern.captures(&upper_title) {
|
||||
if let Some(qty_match) = caps.get(1) {
|
||||
if let Ok(parsed_qty) = qty_match.as_str().parse::<i64>() {
|
||||
if parsed_qty > 0 && parsed_qty < 500 {
|
||||
quantity = parsed_qty;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut size_matches: Vec<(f64, String)> = Vec::new();
|
||||
for caps in SIZE_REGEX.captures_iter(&upper_title) {
|
||||
if let (Some(val_str), Some(unit_str)) = (caps.get(1), caps.get(2)) {
|
||||
if let Ok(val) = val_str.as_str().parse::<f64>() {
|
||||
size_matches.push((val, unit_str.as_str().to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !size_matches.is_empty() {
|
||||
let mut unique_sizes_gb: Vec<i64> = size_matches
|
||||
.iter()
|
||||
.map(|(val, unit)| { if unit == "TB" { *val * 1024.0 } else { *val } } as i64)
|
||||
.collect();
|
||||
unique_sizes_gb.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
|
||||
unique_sizes_gb.dedup();
|
||||
|
||||
if !unique_sizes_gb.is_empty() {
|
||||
individual_size_gb = unique_sizes_gb[0];
|
||||
if unique_sizes_gb.len() > 1 {
|
||||
needed_description_check = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if SIZE_RANGE_REGEX.is_match(&upper_title) {
|
||||
needed_description_check = true;
|
||||
}
|
||||
if quantity > 1 && upper_title.contains("MIXED") {
|
||||
needed_description_check = true;
|
||||
}
|
||||
if upper_title.contains("CHECK THE DESCRIPTION")
|
||||
|| upper_title.contains("CHECK DESCRIPTION")
|
||||
|| upper_title.contains("SEE DESCRIPTION")
|
||||
{
|
||||
if quantity > 1 || size_matches.is_empty() || size_matches.len() > 1 {
|
||||
needed_description_check = true;
|
||||
}
|
||||
}
|
||||
|
||||
if individual_size_gb > 0 {
|
||||
total_gb = individual_size_gb * quantity;
|
||||
}
|
||||
|
||||
if quantity > 1 && total_gb == 0 && !size_matches.is_empty() {
|
||||
needed_description_check = true;
|
||||
}
|
||||
|
||||
if quantity == 1 && size_matches.len() == 1 && !needed_description_check {
|
||||
// This condition is implicitly handled
|
||||
}
|
||||
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: item_id,
|
||||
total_gigabytes: total_gb,
|
||||
quantity,
|
||||
individual_size_gigabytes: individual_size_gb,
|
||||
needed_description_check,
|
||||
parse_engine: 0,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use similar_asserts::assert_eq;
|
||||
|
||||
static TESTS: &[(&str, ParsedStorage, bool)] = &[
|
||||
(
|
||||
"Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD",
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: 0,
|
||||
total_gigabytes: 512 * 3,
|
||||
quantity: 3,
|
||||
individual_size_gigabytes: 512,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
},
|
||||
true,
|
||||
),
|
||||
(
|
||||
"Samsung SM883 2.5†240GB SATA 6Gbps MZ7KH240HAHQ-00005",
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: 1,
|
||||
total_gigabytes: 240,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 240,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
},
|
||||
true,
|
||||
),
|
||||
(
|
||||
"1TB AData SU650 2.5-inch SATA 6Gb/s SSD Solid State Disk (READ)",
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: 2,
|
||||
total_gigabytes: 1024,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 1024,
|
||||
parse_engine: 0,
|
||||
needed_description_check: true,
|
||||
},
|
||||
false, // Sadly this one fails :/
|
||||
),
|
||||
(
|
||||
"Hitachi VSP 7TB Flash Module Drive (FMD) 3286734-A DKC-F810I-7R0FP",
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: 4,
|
||||
total_gigabytes: 7 * 1024,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 7 * 1024,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
},
|
||||
true,
|
||||
),
|
||||
(
|
||||
"(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)",
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: 5,
|
||||
total_gigabytes: 6 * 256,
|
||||
quantity: 6,
|
||||
individual_size_gigabytes: 256,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
},
|
||||
true,
|
||||
),
|
||||
(
|
||||
"Lenovo-Micron 5300 Pro 1.92TB 2.5\" Sata SSD 02JG540 (MTFDDAK1T9TDS)",
|
||||
ParsedStorage {
|
||||
id: 0,
|
||||
item: 6,
|
||||
total_gigabytes: 1966,
|
||||
quantity: 1,
|
||||
individual_size_gigabytes: 1966,
|
||||
parse_engine: 0,
|
||||
needed_description_check: false,
|
||||
},
|
||||
true,
|
||||
),
|
||||
];
|
||||
|
||||
#[test_log::test]
|
||||
fn parse() {
|
||||
for t in TESTS {
|
||||
if t.2 {
|
||||
assert_eq!(t.1, parse_size_and_quantity(t.1.item, t.0));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user