Add LLM based parsing
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 3m49s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m2s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m17s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 9m36s
All checks were successful
Cargo Build & Test / Rust project - latest (1.86) (push) Successful in 3m49s
Cargo Build & Test / Rust project - latest (1.87) (push) Successful in 4m2s
Cargo Build & Test / Rust project - latest (1.88) (push) Successful in 4m17s
Cargo Build & Test / Rust project - latest (1.85.1) (push) Successful in 9m36s
This commit is contained in:
110
readme.md
110
readme.md
@@ -10,3 +10,113 @@ echo run2 && http GET "$URL_BASE/listing/since/12345678/2" && \
|
|||||||
echo run3 && http GET "$URL_BASE/listing/388484391867" && \
|
echo run3 && http GET "$URL_BASE/listing/388484391867" && \
|
||||||
echo run4 && http GET "$URL_BASE/listing/286605201240/history"
|
echo run4 && http GET "$URL_BASE/listing/286605201240/history"
|
||||||
```
|
```
|
||||||
|
|
||||||
|
And some jq usage for interacting with the raw data:
|
||||||
|
```bash
|
||||||
|
# Download a bunch of listings.
|
||||||
|
http https://scraper.hak8or.com/api/listings since==0 limit==20 > listings.json
|
||||||
|
|
||||||
|
# Show what a single listing looks like.
|
||||||
|
cat listings.json | jq '.[0]'
|
||||||
|
{
|
||||||
|
"listing": {
|
||||||
|
"id": 22563,
|
||||||
|
"item_id": 286707621236,
|
||||||
|
"title": "WD_BLACK SN770M 2TB M.2 NVMe Internal SSD (WDBDNH0020BBK-WRSN)",
|
||||||
|
"buy_it_now_price_cents": null,
|
||||||
|
"has_best_offer": false,
|
||||||
|
"image_url": "https://i.ebayimg.com/images/g/It4AAeSwzz5oddoa/s-l140.jpg"
|
||||||
|
},
|
||||||
|
"history": [
|
||||||
|
{
|
||||||
|
"item": 286707621236,
|
||||||
|
"timestamp": "2025-07-15T04:46:54Z",
|
||||||
|
"category": "ssd",
|
||||||
|
"current_bid_usd_cents": 12900
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"parsed": [
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"item": 286707621236,
|
||||||
|
"total_gigabytes": 2048,
|
||||||
|
"quantity": 1,
|
||||||
|
"individual_size_gigabytes": 2048,
|
||||||
|
"parse_engine": 0,
|
||||||
|
"needed_description_check": false
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Show the items at indices 1 and 2 (the 2nd and 3rd listings), but only grab a few specific fields.
|
||||||
|
cat listings_small.json | jq '[.[1:3][] | {
|
||||||
|
item_id: .listing.item_id,
|
||||||
|
title: .listing.title,
|
||||||
|
parsed: .parsed[] | {
|
||||||
|
total_gigabytes,
|
||||||
|
quantity,
|
||||||
|
individual_size_gigabytes
|
||||||
|
}
|
||||||
|
}]'
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"item_id": 297545995095,
|
||||||
|
"title": "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!",
|
||||||
|
"parsed": {
|
||||||
|
"total_gigabytes": 1024,
|
||||||
|
"quantity": 1,
|
||||||
|
"individual_size_gigabytes": 1024
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"item_id": 127220979797,
|
||||||
|
"title": "Kingston NV2 2TB M.2 3500MG/S NVMe Internal SSD PCIe 4.0 Gen SNV2S/2000G C-/#qWT",
|
||||||
|
"parsed": {
|
||||||
|
"total_gigabytes": 2048,
|
||||||
|
"quantity": 1,
|
||||||
|
"individual_size_gigabytes": 2048
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
And now an LLM-based parse, using the following prompt (189 tokens for Gemini 2.5 Flash Lite):
|
||||||
|
```
|
||||||
|
I will provide you with a listing title I want you to analyse. Then you will tell me the total gigabytes of all drives listed in the listing, how many drives are specified in the title, and the gigabytes of each drive in the listing. Here is an example for a title of "Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
|
||||||
|
```
|
||||||
|
{
|
||||||
|
"total_gigabytes": 1024,
|
||||||
|
"quantity": 1,
|
||||||
|
"individual_size_gigabytes": 1024
|
||||||
|
}
|
||||||
|
```
|
||||||
|
Reply with "OK" (and _only_ "OK") if you understand this. After you reply with that, I will provide you with a title, and then you will reply with solely the requested json (and ONLY said json).
|
||||||
|
```
|
||||||
|
|
||||||
|
And passing a title of (30 tokens);
|
||||||
|
```
|
||||||
|
Lot Of 3 Western Digital PC SN740 512GB M.2 2230 NVMe Internal SSD
|
||||||
|
```
|
||||||
|
returns the following json of (41 tokens);
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"total_gigabytes": 1536,
|
||||||
|
"quantity": 3,
|
||||||
|
"individual_size_gigabytes": 512
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
and another example of sending (49 tokens)
|
||||||
|
```
|
||||||
|
(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD (MZVLB256HBHQ-000H1)
|
||||||
|
```
|
||||||
|
returns the following json of (42 tokens);
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"total_gigabytes": 1536,
|
||||||
|
"quantity": 6,
|
||||||
|
"individual_size_gigabytes": 256
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
So for 1 listing we have a 189-token "System Prompt", then a ~45-token title prompt, and a 42-token parsed reply. Given 30,000 listings, that's 5,670,000 tokens of "System Prompt" as input, 1,350,000 tokens of title prompt as input, and 1,260,000 tokens of parsed information as output. Assuming Gemini 2.5 Flash Lite, which is $0.10/M for input and $0.40/M for output, we would pay $0.702 for input and $0.504 for output, or $1.206 total.
|
||||||
|
@@ -1,4 +1,5 @@
|
|||||||
pub mod db;
|
pub mod db;
|
||||||
pub mod parser;
|
pub mod parser;
|
||||||
pub mod parser_ebay;
|
pub mod parser_ebay;
|
||||||
pub mod parser_storage;
|
pub mod parser_storage_e0;
|
||||||
|
pub mod parser_storage_e1;
|
||||||
|
@@ -6,7 +6,7 @@ use ebay_scraper_rust::db::{
|
|||||||
get_stats, listings_get_filtered,
|
get_stats, listings_get_filtered,
|
||||||
};
|
};
|
||||||
use ebay_scraper_rust::parser::parse_dir;
|
use ebay_scraper_rust::parser::parse_dir;
|
||||||
use ebay_scraper_rust::parser_storage;
|
use ebay_scraper_rust::{parser_storage_e0, parser_storage_e1};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
use std::sync::Mutex;
|
use std::sync::Mutex;
|
||||||
@@ -102,7 +102,7 @@ async fn parse_listings(db: Data<Mutex<rusqlite::Connection>>) -> Result<impl Re
|
|||||||
let db_unlocked = db.lock().unwrap();
|
let db_unlocked = db.lock().unwrap();
|
||||||
Listing::lookup_non_parsed(&db_unlocked)
|
Listing::lookup_non_parsed(&db_unlocked)
|
||||||
.iter()
|
.iter()
|
||||||
.map(|l| parser_storage::parse_size_and_quantity(l.0, &l.1))
|
.map(|l| parser_storage_e0::parse_size_and_quantity(l.0, &l.1))
|
||||||
.inspect(|_| cnt = cnt + 1)
|
.inspect(|_| cnt = cnt + 1)
|
||||||
.for_each(|ps| ps.add_or_update(&db_unlocked));
|
.for_each(|ps| ps.add_or_update(&db_unlocked));
|
||||||
|
|
||||||
|
39
src/parser_storage_e1.rs
Normal file
39
src/parser_storage_e1.rs
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
use crate::db::ParsedStorage;
|
||||||
|
|
||||||
|
// Given this prompt and a string of "(Lot of 6) Samsung MZ-VLB2560 256GB M.2 NVMe Internal SSD
|
||||||
|
// (MZVLB256HBHQ-000H1)", we get 338 input tokens and 36 output tokens. Assuming no caching and
|
||||||
|
// Gemini 2.5 Flash Lite at $0.10/M input and $0.40/M output, this would cost $0.0000338 Input,
|
||||||
|
// $0.0000144 Output, and $0.0000482 Total. Given 30,000 listings this would be $1.446.
|
||||||
|
// Instruction text describing the JSON reply format expected from the LLM for a
// single listing title. The reply carries three keys: `quantity`, `gigabytes`
// and `fail_reason`. Both embedded examples are kept as valid JSON inside closed
// code fences, because the prompt asks the model to adhere strictly to the
// structure shown.
// NOTE(review): nothing visible in this file references this constant yet;
// presumably it is sent ahead of each listing title — confirm once wired up.
const SYSTEM_PROMPT: &str = r#"
You will be given a product listing for one or more storage drives. You will return *ONLY* JSON strictly adhering to the same structure and key names as below. You will specify how many storage drives are included in the listing as a number (1, 2, 3, etc), the size in gigabytes of each drive as a number (rounding up if needed, so 1, 2, 3, etc), and lastly if the above cannot be provided due to the listing title being incomplete or confusing, a very short reason why.

Here is an example for a title of "Lot of 2, Crucial P3 Plus 1TB NVMe Internal M.2 SSD (CT1000P3PSSD8) - Barely used!";
```json
{
  "quantity": 2,
  "gigabytes": 1024,
  "fail_reason": ""
}
```

And an example for an unclear title of "Pallet of assorted 128GB to 5TB drives";
```json
{
  "quantity": 0,
  "gigabytes": 0,
  "fail_reason": "multiple mixed sizes"
}
```
"#;
|
||||||
|
|
||||||
|
/// Parses size and quantity information from an item title.
|
||||||
|
pub fn parse_size_and_quantity(item_id: i64, title: &str) -> ParsedStorage {
|
||||||
|
ParsedStorage {
|
||||||
|
id: 0,
|
||||||
|
item: item_id,
|
||||||
|
total_gigabytes: 0,
|
||||||
|
quantity: 0,
|
||||||
|
individual_size_gigabytes: 0,
|
||||||
|
needed_description_check: false,
|
||||||
|
parse_engine: 0,
|
||||||
|
}
|
||||||
|
}
|
Reference in New Issue
Block a user