More CLI support

This commit is contained in:
2025-05-28 00:34:55 -04:00
parent 411e21ca7a
commit 8cb0fd12fb
4 changed files with 253 additions and 86 deletions

View File

@ -1,91 +1,251 @@
// ebay_command_line_tool.js
// A Node.js script to scrape eBay search results and output JSON.
// Uses ebay_core.js for parsing and extraction logic.
// Usage: node ebay_command_line_tool.js "EBAY_SEARCH_URL"
// ebay_command_line_tool.js V3
// Node.js script with commands to scrape eBay and output JSON.
// Enhanced with network blocking for --load, --only_json flag, and improved help.
// Usage: node ebay_command_line_tool.js [command] [options] [url]
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
const { Command } = require('commander');
// --- Main Scraping Function ---
async function scrapeEbayFromCommandLine(url) {
if (!url) {
console.error("Error: eBay search URL is required as the first argument.");
console.log("Example Usage: node ebay_command_line_tool.js \"https://www.ebay.com/sch/i.html?_nkw=ssd\"");
process.exit(1);
// --- Load Core Script ---
// ebay_core.js is expected to sit next to this script. Its source text is read
// once at startup and kept as a string (it is evaluated inside the page later).
const coreScriptPath = path.join(__dirname, 'ebay_core.js');
let ebayCoreScriptContent;
try {
    const coreSource = fs.readFileSync(coreScriptPath, 'utf8');
    if (coreSource === '') {
        throw new Error("ebay_core.js is empty or could not be read properly.");
    }
    ebayCoreScriptContent = coreSource;
} catch (loadError) {
    // Startup failures are always reported on stderr, even when --only_json
    // is requested, and terminate the process with a non-zero status.
    const diagnostics = [
        `Critical Error: Could not read ebay_core.js from ${coreScriptPath}`,
        "Please ensure 'ebay_core.js' exists in the same directory as this script.",
        loadError.message,
    ];
    for (const line of diagnostics) {
        console.error(line);
    }
    process.exit(1);
}
// --- Load Core Script ---
// This assumes ebay_core.js is in the same directory as this script.
const coreScriptPath = path.join(__dirname, 'ebay_core.js');
let ebayCoreScriptContent;
try {
ebayCoreScriptContent = fs.readFileSync(coreScriptPath, 'utf8');
if (!ebayCoreScriptContent) {
throw new Error("ebay_core.js is empty or could not be read properly.");
}
} catch (e) {
console.error(`Error: Could not read ebay_core.js from ${coreScriptPath}`);
console.error("Please ensure 'ebay_core.js' exists in the same directory as this script.");
console.error(e.message);
process.exit(1);
// --- Global State for --only_json ---
// While true, logMessage() and logError() print nothing.
let quietMode = false;

// --- Logger functions that respect quietMode ---

/**
 * Write an informational message with console.log unless quiet mode is on.
 * @param {*} message - Value passed to console.log unchanged.
 */
function logMessage(message) {
    if (quietMode) {
        return;
    }
    console.log(message);
}

/**
 * Write an error message with console.error unless quiet mode is on.
 * @param {*} message - Value passed to console.error unchanged.
 */
function logError(message) {
    if (quietMode) {
        return;
    }
    console.error(message);
}
console.log(`Attempting to scrape: ${url}`);
let browser; // Declare browser outside try so it can be closed in finally
// --- Main Scraping Function (Updated) ---
/**
 * Render an eBay search-results page in headless Chromium and run the injected
 * EbayParser over it.
 *
 * The page comes from `htmlFile` when given (every request the page issues is
 * aborted), otherwise from `url`. Failures are logged through logError and
 * reported to the caller as an empty array; nothing is rethrown.
 *
 * @param {object} options
 * @param {?string} [options.url=null] - Search URL to navigate to when no htmlFile is given.
 * @param {?string} [options.htmlFile=null] - Path of a saved HTML page; takes precedence over url.
 * @param {?string} [options.saveFile=null] - Where to write the fetched HTML; only used on the url path.
 * @returns {Promise<Array>} The value returned by EbayParser.extractDataFromPage(), or [] on any error.
 */
async function scrapeEbay({ url = null, htmlFile = null, saveFile = null }) {
    logMessage("Starting scraping process...");
    let browser;
    try {
        browser = await puppeteer.launch({
            headless: true,
            args: ['--no-sandbox', '--disable-setuid-sandbox']
        });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36');

        let htmlContentToParse;
        if (htmlFile) {
            // --- Load from File with Network Blocking ---
            logMessage(`Loading HTML from ${htmlFile}...`);
            htmlContentToParse = fs.readFileSync(htmlFile, 'utf8');
            logMessage("Enabling request interception to block network calls...");
            await page.setRequestInterception(true);
            page.on('request', (request) => {
                // Abort all types of requests.
                logMessage(`Blocking request to: ${request.url()}`);
                // abort() returns a promise; handle its rejection so a failed
                // abort cannot surface as an unhandled rejection.
                request.abort().catch((abortError) => {
                    logError(`Could not abort request to ${request.url()}: ${abortError.message}`);
                });
            });
            await page.setContent(htmlContentToParse, { waitUntil: 'domcontentloaded' });
            logMessage("HTML loaded and set. Network requests are being blocked.");
        } else if (url) {
            // --- Fetch from URL ---
            logMessage(`Navigating to ${url}...`);
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 90000 });
            logMessage("Navigation successful.");
            htmlContentToParse = await page.content();
            logMessage("Page content retrieved.");

            // --- Save to File (if requested) ---
            if (saveFile && htmlContentToParse) {
                logMessage(`Saving HTML to ${saveFile}...`);
                fs.writeFileSync(saveFile, htmlContentToParse, 'utf8');
                logMessage("HTML saved.");
            }
        } else {
            // Caught by the catch below, so the caller still receives [].
            throw new Error("Internal Error: Neither URL nor HTML file was provided to scrapeEbay.");
        }

        // --- Inject and Execute Core Logic ---
        // Evaluating the core script's source in the page is what defines
        // window.EbayParser there.
        logMessage("Injecting core parser script...");
        await page.evaluate(ebayCoreScriptContent);
        logMessage("Core script injected. Extracting data...");

        // This callback runs inside the page, not in Node.
        const extractedResults = await page.evaluate(() => {
            if (typeof window.EbayParser === 'undefined' || typeof window.EbayParser.extractDataFromPage !== 'function') {
                throw new Error("EbayParser or EbayParser.extractDataFromPage function was not properly injected or is missing in ebay_core.js!");
            }
            return window.EbayParser.extractDataFromPage();
        });
        logMessage(`Data extraction complete. Found ${extractedResults.length} items.`);
        return extractedResults;
    } catch (e) {
        logError(`An error occurred during the scraping process: ${e.message}`);
        // The stack trace is printed only when not in quiet mode.
        if (!quietMode && e.stack) {
            console.error(e.stack);
        }
        return []; // Return empty array on error
    } finally {
        if (browser) {
            try {
                await browser.close();
                logMessage("Browser closed.");
            } catch (closeError) {
                // A failing close must not replace the value being returned.
                logError(`Could not close the browser cleanly: ${closeError.message}`);
            }
        }
    }
}
// --- Script Execution ---
// The first actual argument to the script (process.argv[0] is node, process.argv[1] is the script path)
const searchUrl = process.argv[2];
// --- Setup Command Line Interface ---
// Root command: identity, version and the options shared by every invocation.
const program = new Command();
program.name('ebay-scraper');
program.description('Scrapes eBay search results for SSD/HDD cost per TB.');
program.version('3.0.0');
program.option('--save <filename>', 'Save the scraped HTML to a file.');
program.option('--load <filename>', 'Load HTML from a file instead of fetching from eBay (disables network).');
program.option('--only_json', 'Suppress all informational logs and output only the final JSON.', false);
// Turn quiet mode on from the option event itself rather than waiting for an
// action handler to inspect the parsed options.
program.on('option:only_json', () => {
    quietMode = true;
});
// 'latest' subcommand: builds a predefined eBay search URL from --per_page and
// --minimum_cost and scrapes it, unless the global --load option supplies a
// saved page, in which case the URL options are ignored.
program
    .command('latest')
    .description('Scrapes the latest listings using a predefined search. Use "ebay-scraper latest --help" to see specific options for this command.')
    .option('--per_page <number>', 'Items per page (60, 120, or 240)', '60')
    .option('--minimum_cost <number>', 'Minimum cost for listings (e.g., 50.00)', '0.00')
    .action(async (cmdOptions) => { // cmdOptions holds the 'latest' command's own options
        const globalOptions = program.opts(); // --save, --load, --only_json live on the root command
        if (globalOptions.only_json) quietMode = true;

        if (globalOptions.load) {
            logMessage("Using --load, 'latest' command options for URL generation will be ignored.");
            await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
            return;
        }

        // Validation failures exit immediately in normal mode. In quiet mode
        // they throw instead, leaving the reporting to whoever awaits the parse.
        const validPages = ['60', '120', '240'];
        if (!validPages.includes(cmdOptions.per_page)) {
            logError(`Error: --per_page must be one of ${validPages.join(', ')}.`);
            if (!quietMode) process.exit(1); else throw new Error("Invalid per_page");
        }

        // parseFloat() would accept any numeric prefix ("50abc" -> 50) and
        // negative values, so require the whole argument to be a plain
        // non-negative decimal before converting it.
        const rawMinCost = String(cmdOptions.minimum_cost).trim();
        const isPlainDecimal = /^(?:\d+\.?\d*|\.\d+)$/.test(rawMinCost);
        const minCost = isPlainDecimal ? Number(rawMinCost) : Number.NaN;
        if (!Number.isFinite(minCost)) {
            logError("Error: --minimum_cost must be a non-negative number (e.g., 50.00).");
            if (!quietMode) process.exit(1); else throw new Error("Invalid minimum_cost");
        }

        const baseUrl = 'https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10';
        const url = `${baseUrl}&_ipg=${cmdOptions.per_page}&_udlo=${minCost.toFixed(2)}`;
        logMessage(`Constructed URL for 'latest': ${url}`);
        await runScraping({ url: url, saveFile: globalOptions.save });
    });
// Handle URL as an argument. This is the root command's own action, used when
// no subcommand such as 'latest' is named on the command line.
program
    .argument('[url]', 'The full eBay search URL to scrape.')
    .action(async (url) => {
        const globalOptions = program.opts();
        if (globalOptions.only_json) quietMode = true;

        if (globalOptions.load) {
            logMessage("Using --load, any provided URL argument will be ignored.");
            await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
            return;
        }

        if (url) {
            await runScraping({ url: url, saveFile: globalOptions.save });
            return;
        }

        // Neither a URL nor --load was supplied. With no URL operand there is
        // nothing in program.args that could match a command name, so the old
        // guard around the help call was always true and has been dropped.
        if (quietMode) {
            // Help text on stdout would break consumers expecting JSON only;
            // throw instead, as the 'latest' validation does in quiet mode.
            throw new Error("No eBay URL or --load file was provided.");
        }
        program.help();
    });
// Register extra help text at the 'after' position: a list of example
// invocations covering the 'latest' subcommand, a direct URL, and the
// --load / --only_json / --save options. The template literal is emitted
// verbatim, so its line breaks and leading characters are part of the output.
program.addHelpText('after', `
Example calls:
$ ebay-scraper latest --per_page 120 --minimum_cost 50
$ ebay-scraper latest --help
$ ebay-scraper "https://www.ebay.com/sch/i.html?_nkw=ssd"
$ ebay-scraper --load saved_page.html --only_json | jq .
$ ebay-scraper --save current_page.html "https://www.ebay.com/sch/i.html?_nkw=hdd"`);
// --- Wrapper to run scraping and print results ---
/**
 * Run scrapeEbay with the given options and print the outcome.
 *
 * In quiet mode the result is written to stdout as pretty-printed JSON with no
 * trailing newline, whatever it contains. Otherwise a non-empty result is
 * printed with console.log and an empty one yields an informational message.
 * Any exception is reported; in quiet mode that report is a JSON object of the
 * form {error, data: []} on stdout.
 *
 * @param {object} options - Passed straight through to scrapeEbay.
 * @returns {Promise<void>}
 */
async function runScraping(options) {
    try {
        const results = await scrapeEbay(options);

        if (quietMode) {
            // Only the JSON string, no extra newlines or messages.
            process.stdout.write(JSON.stringify(results, null, 2));
            return;
        }

        const hasResults = Boolean(results) && results.length > 0;
        if (!hasResults) {
            logMessage("No data extracted or a critical error occurred during scraping.");
            return;
        }
        console.log(JSON.stringify(results, null, 2));
    } catch (failure) {
        logError(`Critical error in runScraping: ${failure.message}`);
        if (quietMode) {
            // Keep stdout parseable as JSON even when something went wrong.
            process.stdout.write(JSON.stringify({error: failure.message, data: []}));
        } else if (failure.stack) {
            console.error(failure.stack);
        }
    }
}
// --- Parse Arguments and Run ---
// Entry point: hand argv to commander, which dispatches to the 'latest' action
// or the root [url] action, and turn any error thrown along the way into
// either a stderr report (normal mode) or a JSON error object (quiet mode).
(async () => {
    try {
        await program.parseAsync(process.argv);

        // Fallback kept from the previous logic: with no arguments at all, show
        // the help. NOTE(review): the root action appears to reach
        // program.help() first in that case - confirm before removing.
        if (process.argv.slice(2).length === 0) {
            program.help();
        }
    } catch (error) {
        logError(`Command parsing error: ${error.message}`);
        if (!quietMode && error.stack) console.error(error.stack);
        if (quietMode) {
            process.stdout.write(JSON.stringify({error: error.message, data: []}));
            // Report failure through the exit status too. exitCode is set
            // instead of calling process.exit() so the pending stdout write
            // is not cut short when stdout is a pipe.
            process.exitCode = 1;
        } else {
            process.exit(1);
        }
    }
})();

View File

@ -1,24 +1,22 @@
// ebay_core.js - Shared Parsing & Extraction Logic
// ebay_core.js V1.1 - Shared Parsing & Extraction Logic
// Added itemCount and sizePerItemTB to output.
(function (root, factory) {
if (typeof module === 'object' && module.exports) {
// Node.js. Does not work with strict CommonJS, but
// works in a Node environment for use with fs.readFileSync + injection.
module.exports = factory();
} else {
// Browser globals (Greasemonkey via @require)
root.EbayParser = factory();
}
}(typeof self !== 'undefined' ? self : this, function () {
'use strict';
const EbayParser = {}; // The object we will export/attach
const EbayParser = {};
EbayParser.parseSizeAndQuantity = function(title) {
title = title ? title.toUpperCase() : "";
let totalTB = 0;
let quantity = 1;
let needed_description_check = false;
let individualSizeTB = 0;
let individualSizeTB = 0; // Will hold the size per item
const explicitQtyPatterns = [
/\b(?:LOT\s+OF|LOT)\s*\(?\s*(\d+)\s*\)?/i,
@ -51,7 +49,7 @@
sizeMatches.map(sm => sm.unit === 'GB' ? sm.value / 1000 : sm.value)
)].sort((a, b) => a - b);
if (uniqueSizesTB.length > 0) {
individualSizeTB = uniqueSizesTB[0];
individualSizeTB = uniqueSizesTB[0]; // Set individual size
if (uniqueSizesTB.length > 1) needed_description_check = true;
}
}
@ -77,10 +75,15 @@
needed_description_check = false;
}
return { totalTB: parseFloat(totalTB.toFixed(4)), quantity, needed_description_check };
return {
totalTB: parseFloat(totalTB.toFixed(4)),
quantity: quantity, // Renamed to 'quantity' internally, maps to 'itemCount'
needed_description_check: needed_description_check,
individualSizeTB: parseFloat(individualSizeTB.toFixed(4)) // Added size per item
};
};
EbayParser.parsePrice = function(priceText) {
EbayParser.parsePrice = function(priceText) { /* ... (Keep existing parsePrice function) ... */
priceText = priceText || "";
if (priceText.toLowerCase().includes(' to ')) {
return null;
@ -92,26 +95,20 @@
return null;
};
EbayParser.runUnitTests = function() {
EbayParser.runUnitTests = function() { /* ... (Keep existing runUnitTests function) ... */
// Ensure console exists (for Node vs Browser safety, though Node has it)
const log = typeof console !== 'undefined' ? console.log : function() {};
const error = typeof console !== 'undefined' ? console.error : function() {};
log("Ebay Cost/TB: --- Running Unit Tests ---");
const testCases = [
{ title: "LOT OF (9) MAJOR BRAND 2.5\" 7MM SSD * Kingston, Samsung, SanDisk& PNY*120-250GB", expected: { totalTB: 1.080, quantity: 9, needed_description_check: true } },
{ title: "Lot of 10 Intel 256 GB 2.5\" SATA SSD different Model check the Description", expected: { totalTB: 2.560, quantity: 10, needed_description_check: true } },
{ title: "Lot of*10 Mixed brands 240GB-256GB 2.5\" SATA SSD Drives Working & tested", expected: { totalTB: 2.400, quantity: 10, needed_description_check: true } },
{ title: "Lot of 9 SSD 120&128 GB 2.5\" SATA different brands check the description", expected: { totalTB: 1.080, quantity: 9, needed_description_check: true } },
{ title: "Bulk 5 Lot Samsung 870 EVO 500GB SSD SATA - Used - Tested Passed Smart Test", expected: { totalTB: 2.500, quantity: 5, needed_description_check: false } },
{ title: "Samsung 1.6TB NVME PCIe 3.0 x8 2.75\" SSD MZPLK1T6HCHP PM1725 Series TLC", expected: { totalTB: 1.6, quantity: 1, needed_description_check: false } },
{ title: "Brand New Crucial X6 2TB Portable External SSD (CT2000X6SSD9)", expected: { totalTB: 2.0, quantity: 1, needed_description_check: false } },
{ title: "Western Digital WD_BLACK SN850X 2TB NVMe Internal SSD", expected: { totalTB: 2.0, quantity: 1, needed_description_check: false } },
{ title: "Corsair Force Series MP600 1TB Gen4 PCIe X4 NVMe M.2 SSD Up to 4950 MB/s CSSD...", expected: { totalTB: 1.0, quantity: 1, needed_description_check: false } },
{ title: "Micron 5100 MAX 1.84TB SATA 6Gb/s 2.5\" SSD MTFDDAK1T9TCC-1AR1ZABYY", expected: { totalTB: 1.84, quantity: 1, needed_description_check: false } },
{ title: "Dell 0HGX92 1.6TB 2.5” PCIe NVMe Gen4 SSD Intel D7-P5600 SSDPF2KE016T9T HGX92 ES", expected: { totalTB: 1.6, quantity: 1, needed_description_check: false } },
{ title: "10-PACK 1TB SSD", expected: { totalTB: 10.0, quantity: 10, needed_description_check: false } },
{ title: "LOT OF 2X 1TB SSDs", expected: { totalTB: 2.0, quantity: 2, needed_description_check: false } }
// Add expected individualSizeTB to tests
{ title: "LOT OF (9) MAJOR BRAND 2.5\" 7MM SSD * Kingston, Samsung, SanDisk& PNY*120-250GB", expected: { totalTB: 1.080, quantity: 9, individualSizeTB: 0.120, needed_description_check: true } },
{ title: "Lot of 10 Intel 256 GB 2.5\" SATA SSD different Model check the Description", expected: { totalTB: 2.560, quantity: 10, individualSizeTB: 0.256, needed_description_check: true } },
{ title: "Bulk 5 Lot Samsung 870 EVO 500GB SSD SATA - Used - Tested Passed Smart Test", expected: { totalTB: 2.500, quantity: 5, individualSizeTB: 0.500, needed_description_check: false } },
{ title: "Samsung 1.6TB NVME PCIe 3.0 x8 2.75\" SSD MZPLK1T6HCHP PM1725 Series TLC", expected: { totalTB: 1.6, quantity: 1, individualSizeTB: 1.6, needed_description_check: false } },
{ title: "Micron 5100 MAX 1.84TB SATA 6Gb/s 2.5\" SSD MTFDDAK1T9TCC-1AR1ZABYY", expected: { totalTB: 1.84, quantity: 1, individualSizeTB: 1.84, needed_description_check: false } },
{ title: "10-PACK 1TB SSD", expected: { totalTB: 10.0, quantity: 10, individualSizeTB: 1.0, needed_description_check: false } },
];
let testsPassed = 0;
@ -119,16 +116,17 @@
testCases.forEach((test, index) => {
const result = EbayParser.parseSizeAndQuantity(test.title);
const totalTBCheck = Math.abs(result.totalTB - test.expected.totalTB) < 0.0001;
const quantityCheck = result.quantity === test.expected.quantity;
const neededCheck = result.needed_description_check === test.expected.needed_description_check;
const tbCheck = Math.abs(result.totalTB - test.expected.totalTB) < 0.0001;
const qCheck = result.quantity === test.expected.quantity;
const sizeCheck = Math.abs(result.individualSizeTB - test.expected.individualSizeTB) < 0.0001;
const needCheck = result.needed_description_check === test.expected.needed_description_check;
if (totalTBCheck && quantityCheck && neededCheck) {
if (tbCheck && qCheck && sizeCheck && needCheck) {
testsPassed++;
} else {
error(`Test ${index + 1}: FAILED - "${test.title}"`);
error(` Expected: totalTB=${test.expected.totalTB.toFixed(4)}, Q=${test.expected.quantity}, Check=${test.expected.needed_description_check}`);
error(` Actual: totalTB=${result.totalTB.toFixed(4)}, Q=${result.quantity}, Check=${result.needed_description_check}`);
error(` Expected: TTB=${test.expected.totalTB.toFixed(4)}, Q=${test.expected.quantity}, STB=${test.expected.individualSizeTB.toFixed(4)}, Check=${test.expected.needed_description_check}`);
error(` Actual: TTB=${result.totalTB.toFixed(4)}, Q=${result.quantity}, STB=${result.individualSizeTB.toFixed(4)}, Check=${result.needed_description_check}`);
testsFailed++;
}
});
@ -137,7 +135,7 @@
return testsFailed === 0;
};
// This function is INTENDED TO RUN IN THE BROWSER via Puppeteer
// Updated to include itemCount and sizePerItemTB
EbayParser.extractDataFromPage = function() {
const itemSelector = 'li.s-item, li.srp-results__item, div.s-item[role="listitem"]';
const itemElements = document.querySelectorAll(itemSelector);
@ -155,10 +153,11 @@
if (!title || !priceText || !itemUrl) return;
// Use the parser functions (assuming 'EbayParser' is global/available)
const listingPrice = EbayParser.parsePrice(priceText);
const parsedInfo = EbayParser.parseSizeAndQuantity(title);
const totalTB = parsedInfo.totalTB;
const quantity = parsedInfo.quantity; // Get quantity
const individualSizeTB = parsedInfo.individualSizeTB; // Get individual size
const needed_description_check = parsedInfo.needed_description_check;
let costPerTB = null;
@ -177,6 +176,8 @@
itemId,
dateFound: today,
listingPrice,
itemCount: quantity, // <-- Added
sizePerItemTB: individualSizeTB > 0 ? parseFloat(individualSizeTB.toFixed(3)) : null, // <-- Added
totalTB: totalTB > 0 ? parseFloat(totalTB.toFixed(3)) : null,
costPerTB: costPerTB !== null ? parseFloat(costPerTB.toFixed(2)) : null,
needed_description_check,
@ -186,5 +187,5 @@
return items;
};
return EbayParser; // Return the object
return EbayParser;
}));

View File

@ -4,7 +4,8 @@
"main": "ebay_command_line_tool.js",
"license": "MIT",
"dependencies": {
"puppeteer": "^24.9.0"
"puppeteer": "^24.9.0",
"commander": "^14.0.0"
},
"scripts": {
"scrape": "node ebay_command_line_tool.js"

View File

@ -159,6 +159,11 @@ color-name@~1.1.4:
resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2"
integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
commander@^14.0.0:
version "14.0.0"
resolved "https://registry.yarnpkg.com/commander/-/commander-14.0.0.tgz#f244fc74a92343514e56229f16ef5c5e22ced5e9"
integrity sha512-2uM9rYjPvyq39NwLRqaiLtWHyDC1FvryJDa2ATTVims5YAS4PupsEQsDvP14FqhFr0P49CYDugi59xaxJlTXRA==
cosmiconfig@^9.0.0:
version "9.0.0"
resolved "https://registry.yarnpkg.com/cosmiconfig/-/cosmiconfig-9.0.0.tgz#34c3fc58287b915f3ae905ab6dc3de258b55ad9d"