Add LLM based parsing

2025-09-06 19:24:24 -04:00
parent b538dd8012
commit 373bc6e050
5 changed files with 153 additions and 3 deletions
--- a/readme.md
+++ b/readme.md
@@ -10,3 +10,113 @@ echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
 echo run3 && http GET "$URL_BASE/listing/388484391867" && \
 echo run4 && http GET "$URL_BASE/listing/286605201240/history"
 ```
 And some jq usage for interacting with the raw data:
 ```bash
 # Download a bunch of listings.
 http https://scraper.hak8or.com/api/listings since==0 limit==20 > listings.json
 # Show what a single listing looks like.
 cat listings.json | jq '.[0]'
 {
  "listing": {
    "id": 22563,
    "item_id": 286707621236,
    "title": "WD_BLACK SN770M 2TB M.2 NVMe Internal SSD (WDBDNH0020BBK-WRSN)",
    "buy_it_now_price_cents": null,
    "has_best_offer": false,
    "image_url": "https://i.ebayimg.com/images/g/It4AAeSwzz5oddoa/s-l140.jpg"
  },
  "history": [
    {
      "item": 286707621236,
      "timestamp": "2025-07-15T04:46:54Z",
      "category": "ssd",
      "current_bid_usd_cents": 12900
    }
  ],
  "parsed": [
    {
      "id": 6,
      "item": 286707621236,
      "total_gigabytes": 2048,
      "quantity": 1,
      "individual_size_gigabytes": 2048,
      "parse_engine": 0,
      "needed_description_check": false
    }
  ]
 }
 # Show the 1st and 2nd items, but only grab a few specific entries.
 cat listings_small.json | jq '[.[1:3][] | {
    item_id: .listing.item_id,
    title: .listing.title,
    parsed: .parsed[] | {
      total_gigabytes,
      quantity,
      individual_size_gigabytes
    }
  }]'
 [
  {
    "item_id": 297545995095,
    "title": "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!",
    "parsed": {
      "total_gigabytes": 1024,
      "quantity": 1,
      "individual_size_gigabytes": 1024
    }
  },
  {
    "item_id": 127220979797,
    "title": "Kingston NV2 2TB M.2 3500MG/S NVMe Internal SSD PCIe 4.0 Gen SNV2S/2000G C-/#qWT",
    "parsed": {
      "total_gigabytes": 2048,
      "quantity": 1,
      "individual_size_gigabytes": 2048
    }
  }
 ]
 ```
 And now an LLM-based parse, using the following prompt (189 tokens for Gemini 2.5 Flash Lite):
 ````
 I will provide you with a listing title I want you to analyse. Then you will tell me the total gigabytes of all drives listed in the listing, how many drives are specified in the title, and the gigabytes of each drive in the listing. Here is an example for a title of "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
 ```
 {
  "total_gigabytes": 1024,
  "quantity": 1,
  "individual_size_gigabytes": 1024
 }
 ```
 Reply with "OK" (and _only_ "OK") if you understand this. After you reply with that, I will provide you with a title, and then you will reply with solely the requested json (and ONLY said json).
 ````
 And passing a title of (30 tokens);
 ```
 Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD
 ```
 returns the following json of (41 tokens);
 ```json
 {
  "total_gigabytes": 1536,
  "quantity": 3,
  "individual_size_gigabytes": 512
 }
 ```
 and another example of sending (49 tokens)
 ```
 (Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)
 ```
 returns the following json of (42 tokens);
 ```json
 {
  "total_gigabytes": 1536,
  "quantity": 6,
  "individual_size_gigabytes": 256
 }
 ```
 So for 1 listing we have a 189-token "System Prompt", then a ~45-token title prompt, and a 42-token parsed reply. Given 30,000 listings, that's 5,670,000 tokens of "System Prompt" as input, 1,350,000 tokens of title prompt as input, and 1,260,000 tokens of parsed information (output). Assuming Gemini 2.5 Flash Lite, which is $0.10/M for input and $0.40/M for output, we would pay $0.702 for input and $0.504 for output, or $1.206 total.
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,4 +1,5 @@
 pub mod db;
 pub mod parser;
 pub mod parser_ebay;
-pub mod parser_storage;
+pub mod parser_storage_e0;
 pub mod parser_storage_e1;
--- a/src/main.rs
+++ b/src/main.rs
@@ -6,7 +6,7 @@ use ebay_scraper_rust::db::{
    get_stats, listings_get_filtered,
 };
 use ebay_scraper_rust::parser::parse_dir;
-use ebay_scraper_rust::parser_storage;
+use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
 use serde::{Deserialize, Serialize};
 use std::path::PathBuf;
 use std::sync::Mutex;
@@ -102,7 +102,7 @@ async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Re
    let db_unlocked = db.lock().unwrap();
    Listing::lookup_non_parsed(&db_unlocked)
        .iter()
-        .map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
+        .map(|l| parser_storage_e0::parse_size_and_quantity(l.0, &l.1))
        .inspect(|_| cnt = cnt + 1)
        .for_each(|ps| ps.add_or_update(&db_unlocked));
--- a/src/parser_storage_e0.rs
+++ b/src/parser_storage_e0.rs
--- a/src/parser_storage_e1.rs
+++ b/src/parser_storage_e1.rs
@@ -0,0 +1,39 @@
 use crate::db::ParsedStorage;
 // System prompt sent to the LLM ahead of each listing title.
 //
 // Cost estimate: given this prompt and a title of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe
 // Internal SSD (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no
 // caching and Gemini 2.5 Flash Lite at $0.10/M input and $0.40/M output, this would cost
 // $0.0000338 input, $0.0000144 output, and $0.0000482 total. Given 30,000 listings this would be
 // $1.446. Token counts are approximate; re-measure after editing the prompt.
 //
 // The embedded examples must stay valid JSON with balanced code fences, since the model is told
 // to adhere strictly to their structure and key names.
 const SYSTEM_PROMPT: &str = r#"
 You will be given a product listing for one or more storage drives. You will return *ONLY* JSON strictly adhering to the same structure and key names as below. You will specify how many storage drives are included in the listing as a number (1, 2, 3, etc), the size in gigabytes of each drive as a number (rounding up if needed, so 1, 2, 3, etc), and lastly if the above cannot be provided due to the listing title being incomplete or confusing, a very short reason why.
 Here is an example for a title of "Lot of 2, Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
 ```json
 {
  "quantity": 2,
  "gigabytes": 1024,
  "fail_reason": ""
 }
 ```
 And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
 ```json
 {
  "quantity": 0,
  "gigabytes": 0,
  "fail_reason": "multiple mixed sizes"
 }
 ```
 "#;
 /// Builds the storage parse result for a listing title.
 ///
 /// Currently a placeholder: `title` is not inspected and no LLM request is made. The returned
 /// [`ParsedStorage`] carries `item_id` in its `item` field; every size and quantity field is
 /// zero and `needed_description_check` is `false`.
 pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
    ParsedStorage {
        // Identity of the record and the listing it belongs to.
        id: 0,
        item: item_id,
        // NOTE(review): this module is `parser_storage_e1`, yet the engine is recorded as 0 —
        // confirm whether this should be 1 once the real parser lands.
        parse_engine: 0,
        // Nothing is extracted from the title yet, so all sizes stay zeroed.
        quantity: 0,
        individual_size_gigabytes: 0,
        total_gigabytes: 0,
        needed_description_check: false,
    }
 }