// ebay_command_line_tool.js V3
// Node.js script with commands to scrape eBay and output JSON.
// Enhanced with network blocking for --load, --only_json flag, and improved help.
// Usage: node ebay_command_line_tool.js [command] [options] [url]

const fs = require('fs');
const path = require('path');

const { Command } = require('commander');
const puppeteer = require('puppeteer');

// --- Load Core Script ---
// The parser source is read once at startup; scrapeEbay later passes this text
// to page.evaluate() so the parser runs inside the loaded page.
const coreScriptPath = path.join(__dirname, 'ebay_core.js'); // Assumes ebay_core.js is in the same directory
let ebayCoreScriptContent;
try {
  ebayCoreScriptContent = fs.readFileSync(coreScriptPath, 'utf8');
  // A file holding only whitespace is as unusable as an empty one, so reject both here
  // instead of failing later during injection.
  if (!ebayCoreScriptContent || ebayCoreScriptContent.trim() === '') {
    throw new Error("ebay_core.js is empty or could not be read properly.");
  }
} catch (e) {
  // This initial error should always print, regardless of --only_json
  console.error(`Critical Error: Could not read ebay_core.js from ${coreScriptPath}`);
  console.error("Please ensure 'ebay_core.js' exists in the same directory as this script.");
  console.error(e.message);
  process.exit(1);
}

// --- Global State for --only_json ---
// When true, logMessage/logError print nothing and runScraping writes bare JSON to stdout.
let quietMode = false;

// --- Logger functions that respect quietMode ---

/**
 * Prints an informational message with console.log unless quiet mode is on.
 *
 * @param {*} message - Value handed unchanged to console.log.
 * @returns {void}
 */
function logMessage(message) {
  if (quietMode) {
    return;
  }
  console.log(message);
}

/**
 * Prints an error message with console.error unless quiet mode is on.
 *
 * @param {*} message - Value handed unchanged to console.error.
 * @returns {void}
 */
function logError(message) {
  if (quietMode) {
    return;
  }
  console.error(message);
}

// --- Main Scraping Function (Updated) ---

// User agent string sent with every page the scraper opens.
const SCRAPER_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36';

/**
 * Loads a saved HTML file into the page while aborting every request the page issues.
 *
 * @param {object} page - Puppeteer page to populate.
 * @param {string} htmlFile - Path of the HTML file to read.
 * @returns {Promise<void>}
 */
async function loadPageFromFile(page, htmlFile) {
  logMessage(`Loading HTML from ${htmlFile}...`);
  const savedHtml = fs.readFileSync(htmlFile, 'utf8');

  logMessage("Enabling request interception to block network calls...");
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    // Abort all types of requests
    logMessage(`Blocking request to: ${request.url()}`);
    request.abort();
  });

  await page.setContent(savedHtml, { waitUntil: 'domcontentloaded' });
  logMessage("HTML loaded and set. Network requests are being blocked.");
}

/**
 * Navigates the page to a URL and optionally writes the retrieved HTML to disk.
 *
 * @param {object} page - Puppeteer page to navigate.
 * @param {string} url - Address to open.
 * @param {?string} saveFile - When set (and the page returned content), path the HTML is written to.
 * @returns {Promise<void>}
 */
async function loadPageFromUrl(page, url, saveFile) {
  logMessage(`Navigating to ${url}...`);
  await page.goto(url, { waitUntil: 'networkidle2', timeout: 90000 });
  logMessage("Navigation successful.");
  const liveHtml = await page.content();
  logMessage("Page content retrieved.");

  // --- Save to File (if requested) ---
  if (saveFile && liveHtml) {
    logMessage(`Saving HTML to ${saveFile}...`);
    fs.writeFileSync(saveFile, liveHtml, 'utf8');
    logMessage("HTML saved.");
  }
}

/**
 * Injects the core parser source into the page and runs its extraction entry point.
 *
 * @param {object} page - Puppeteer page that already holds the document to parse.
 * @returns {Promise<*>} Whatever window.EbayParser.extractDataFromPage() returns in the page.
 */
async function extractListings(page) {
  logMessage("Injecting core parser script...");
  await page.evaluate(ebayCoreScriptContent);
  logMessage("Core script injected. Extracting data...");

  const listings = await page.evaluate(() => {
    if (typeof window.EbayParser === 'undefined' || typeof window.EbayParser.extractDataFromPage !== 'function') {
      throw new Error("EbayParser or EbayParser.extractDataFromPage function was not properly injected or is missing in ebay_core.js!");
    }
    return window.EbayParser.extractDataFromPage();
  });
  logMessage(`Data extraction complete. Found ${listings.length} items.`);
  return listings;
}

/**
 * Launches a headless browser, loads a page from a file or a URL, and runs the core parser on it.
 *
 * htmlFile takes precedence over url; saveFile is only honoured on the URL path.
 * Any failure is logged (subject to quiet mode) and turned into an empty array, so this
 * function does not reject for scraping errors.
 *
 * @param {object} [options]
 * @param {?string} [options.url=null] - Page address to fetch.
 * @param {?string} [options.htmlFile=null] - Saved HTML to parse instead of fetching.
 * @param {?string} [options.saveFile=null] - Where to store the fetched HTML.
 * @returns {Promise<*>} Parser output, or [] when anything went wrong.
 */
async function scrapeEbay({ url = null, htmlFile = null, saveFile = null } = {}) {
  logMessage("Starting scraping process...");

  let browser;
  try {
    browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();
    await page.setUserAgent(SCRAPER_USER_AGENT);

    if (htmlFile) {
      await loadPageFromFile(page, htmlFile);
    } else if (url) {
      await loadPageFromUrl(page, url, saveFile);
    } else {
      // This error is caught below and reported like any other scraping failure.
      throw new Error("Internal Error: Neither URL nor HTML file was provided to scrapeEbay.");
    }

    // `return await` keeps a rejected extraction inside this try/catch.
    return await extractListings(page);
  } catch (e) {
    logError(`An error occurred during the scraping process: ${e.message}`);
    if (!quietMode && e.stack) {
      console.error(e.stack);
    }
    return []; // Return empty array on error
  } finally {
    if (browser) {
      // Closing is best-effort: a failure here must not replace the value already being returned.
      try {
        await browser.close();
        logMessage("Browser closed.");
      } catch (closeError) {
        logError(`Failed to close browser cleanly: ${closeError.message}`);
      }
    }
  }
}

// --- Setup Command Line Interface ---
const program = new Command();

// Options declared on the root program; the command actions read them via program.opts().
program
  .name('ebay-scraper')
  .description('Scrapes eBay search results for SSD/HDD cost per TB.')
  .version('3.0.0')
  .option('--save <filename>', 'Save the scraped HTML to a file.')
  .option('--load <filename>', 'Load HTML from a file instead of fetching from eBay (disables network).')
  .option('--only_json', 'Suppress all informational logs and output only the final JSON.', false)
  .on('option:only_json', () => {
    // Switch to quiet mode as soon as the flag is parsed, before any action runs.
    quietMode = true;
  });

// 'latest' subcommand: builds a predefined search URL from its own options and scrapes it,
// unless --load is given, in which case the saved HTML is parsed instead.
program
  .command('latest')
  .description('Scrapes the latest listings using a predefined search. Use "ebay-scraper latest --help" to see specific options for this command.')
  .option('--per_page <number>', 'Items per page (60, 120, or 240)', '60')
  .option('--minimum_cost <number>', 'Minimum cost for listings (e.g., 50.00)', '0.00')
  .action(async (cmdOptions) => { // cmdOptions refers to 'latest' command's options
    const globalOptions = program.opts(); // Access global options like --save, --load, --only_json
    if (globalOptions.only_json) {
      quietMode = true; // Ensure quietMode is set if command is run directly
    }

    if (globalOptions.load) {
      logMessage("Using --load, 'latest' command options for URL generation will be ignored.");
      await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
      return;
    }

    // Option values arrive as strings, so the allowed page sizes are compared as strings.
    const validPages = ['60', '120', '240'];
    if (!validPages.includes(cmdOptions.per_page)) {
      logError(`Error: --per_page must be one of ${validPages.join(', ')}.`);
      if (!quietMode) {
        process.exit(1);
      }
      // In quiet mode the error is thrown so the top-level handler can emit it as JSON.
      throw new Error("Invalid per_page");
    }

    const minCost = parseFloat(cmdOptions.minimum_cost);
    // isFinite also rejects "Infinity", which would otherwise end up verbatim in the URL.
    if (!Number.isFinite(minCost)) {
      logError("Error: --minimum_cost must be a number.");
      if (!quietMode) {
        process.exit(1);
      }
      throw new Error("Invalid minimum_cost");
    }

    const baseUrl = 'https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10';
    const url = `${baseUrl}&_ipg=${cmdOptions.per_page}&_udlo=${minCost.toFixed(2)}`;
    logMessage(`Constructed URL for 'latest': ${url}`);
    await runScraping({ url, saveFile: globalOptions.save });
  });

// Handle URL as an argument. This will act as the default action if no other command is matched.
program
  .argument('[url]', 'The full eBay search URL to scrape.')
  .action(async (url) => {
    const globalOptions = program.opts();
    if (globalOptions.only_json) {
      quietMode = true;
    }

    // 'latest' command has its own action, so this won't run for 'latest'.
    if (globalOptions.load) {
      logMessage("Using --load, any provided URL argument will be ignored.");
      await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
      return;
    }

    if (url) {
      await runScraping({ url, saveFile: globalOptions.save });
      return;
    }

    // No URL and no --load: show help unless one of the parsed arguments names a subcommand.
    const commandNames = new Set(program.commands.map((command) => command.name()));
    const namesSubcommand = program.args.some((arg) => commandNames.has(arg));
    if (!namesSubcommand) {
      program.help();
    }
  });

// Add help text to guide users for subcommand help
program.addHelpText('after', `

Example calls:
  $ ebay-scraper latest --per_page 120 --minimum_cost 50
  $ ebay-scraper latest --help
  $ ebay-scraper "https://www.ebay.com/sch/i.html?_nkw=ssd"
  $ ebay-scraper --load saved_page.html --only_json | jq .
  $ ebay-scraper --save current_page.html "https://www.ebay.com/sch/i.html?_nkw=hdd"`);

// --- Wrapper to run scraping and print results ---

/**
 * Runs scrapeEbay with the given options and prints the outcome.
 *
 * In quiet mode the result is written to stdout as pretty-printed JSON with no trailing
 * newline. Otherwise the JSON is printed only when at least one item was extracted, and a
 * notice is logged when there is nothing to show.
 *
 * If anything in here throws, the error is reported (as a JSON object on stdout in quiet
 * mode) and the process exit code is set to 1. The exit is not forced, so pending stdout
 * output is not cut off.
 *
 * @param {object} options - Passed through to scrapeEbay (url / htmlFile / saveFile).
 * @returns {Promise<void>}
 */
async function runScraping(options) {
  try {
    const data = await scrapeEbay(options);

    if (quietMode) {
      // Only output JSON string, no extra newlines or messages
      process.stdout.write(JSON.stringify(data, null, 2));
      return;
    }

    if (data && data.length > 0) {
      console.log(JSON.stringify(data, null, 2));
    } else {
      logMessage("No data extracted or a critical error occurred during scraping.");
    }
  } catch (e) {
    logError(`Critical error in runScraping: ${e.message}`);
    if (!quietMode && e.stack) {
      console.error(e.stack);
    }
    if (quietMode) { // Ensure valid JSON output even on error for piping
      process.stdout.write(JSON.stringify({ error: e.message, data: [] }));
    }
    // Signal the failure to the calling shell.
    process.exitCode = 1;
  }
}

// --- Parse Arguments and Run ---
// Entry point: parse the command line, let commander dispatch to the matching action,
// and turn any error thrown by an action into a log message or a JSON error object.
(async () => {
  try {
    await program.parseAsync(process.argv);

    // Invoked with no arguments at all: show the usage text.
    if (process.argv.slice(2).length === 0) {
      program.help();
    }
  } catch (error) {
    logError(`Command parsing error: ${error.message}`);
    if (!quietMode && error.stack) {
      console.error(error.stack);
    }

    if (quietMode) {
      // Keep stdout valid JSON for piping, and still report failure through the exit status.
      // exitCode is used instead of exit() so the JSON written above is not cut off.
      process.stdout.write(JSON.stringify({ error: error.message, data: [] }));
      process.exitCode = 1;
    } else {
      process.exit(1);
    }
  }
})();