// ebay_command_line_tool.js V4.1
// Node.js script with commands to scrape eBay and output JSON.
// Images are now saved preserving their URL path structure within the save directory.

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
const { Command } = require('commander');
const https = require('https'); // For downloading images
const http = require('http');   // For downloading images (fallback)
const { URL } = require('url'); // For parsing image URLs

// --- Load Core Script ---
// ebay_core.js defines window.EbayParser; its source is injected verbatim
// into the scraped page via page.evaluate() below.
const coreScriptPath = path.join(__dirname, 'ebay_core.js');
let ebayCoreScriptContent;
try {
    ebayCoreScriptContent = fs.readFileSync(coreScriptPath, 'utf8');
    if (!ebayCoreScriptContent) throw new Error("ebay_core.js is empty.");
} catch (e) {
    console.error(`Critical Error: Could not read ebay_core.js: ${e.message}`);
    process.exit(1);
}

// When true (--only_json), all informational/error logging is suppressed so
// stdout carries nothing but the final JSON payload (pipe-friendly).
let quietMode = false;

function logMessage(message) {
    if (!quietMode) console.log(message);
}

function logError(message) {
    if (!quietMode) console.error(message);
}

// --- Image Downloading Function ---
/**
 * Downloads one image, mirroring the URL's path structure under
 * baseSaveDirectory (e.g. https://host/images/g/abc/s-l500.webp is written
 * to <baseSaveDirectory>/images/g/abc/s-l500.webp).
 *
 * @param {string} imageUrl - Absolute image URL (http or https).
 * @param {string} baseSaveDirectory - Local root directory for downloads.
 * @returns {Promise<void>|undefined} Resolves on success; rejects on HTTP
 *   status / stream / timeout errors. Returns immediately if imageUrl is falsy.
 */
async function downloadImage(imageUrl, baseSaveDirectory) {
    if (!imageUrl) return;
    try {
        const parsedUrl = new URL(imageUrl);
        // Strip the leading slash so path.join treats the URL path as
        // relative to baseSaveDirectory.
        const imagePathFromUrl = parsedUrl.pathname.startsWith('/')
            ? parsedUrl.pathname.substring(1)
            : parsedUrl.pathname;
        // Separate the directory part and the filename part from the URL path.
        const imageName = path.basename(imagePathFromUrl);
        const imageSubdirectory = path.dirname(imagePathFromUrl);
        // Construct the full local directory path.
        const fullLocalDirectory = path.join(baseSaveDirectory, imageSubdirectory);
        const fullLocalImagePath = path.join(fullLocalDirectory, imageName);

        if (!fs.existsSync(fullLocalDirectory)) {
            fs.mkdirSync(fullLocalDirectory, { recursive: true });
            logMessage(`Created image directory: ${fullLocalDirectory}`);
        }

        const protocol = parsedUrl.protocol === 'https:' ? https : http;

        return new Promise((resolve, reject) => {
            const request = protocol.get(imageUrl, (response) => {
                if (response.statusCode !== 200) {
                    logError(`Failed to download image ${imageUrl}. Status: ${response.statusCode}`);
                    response.resume(); // Consume response data to free up resources
                    reject(new Error(`Status code ${response.statusCode} for ${imageUrl}`));
                    return;
                }
                // Open the file only once the request has succeeded, so a
                // non-200 response or request error leaves no empty stub file.
                const fileStream = fs.createWriteStream(fullLocalImagePath);
                response.pipe(fileStream);
                fileStream.on('finish', () => {
                    fileStream.close();
                    logMessage(`Downloaded image: ${fullLocalImagePath}`);
                    resolve();
                });
                fileStream.on('error', (err) => { // Handle stream errors
                    logError(`Error writing image file ${fullLocalImagePath}: ${err.message}`);
                    fs.unlink(fullLocalImagePath, () => {}); // Attempt to delete partial file
                    reject(err);
                });
            });
            request.on('error', (err) => { // Handle request errors
                logError(`Error downloading image ${imageUrl}: ${err.message}`);
                // No partial file to unlink here as the request itself failed.
                reject(err);
            });
            // Set a timeout for the request.
            request.setTimeout(30000, () => { // 30 seconds timeout
                request.destroy(); // Destroy the request object on timeout
                logError(`Timeout downloading image ${imageUrl}`);
                reject(new Error(`Timeout downloading image ${imageUrl}`));
            });
        });
    } catch (error) {
        logError(`Error processing image URL ${imageUrl}: ${error.message}`);
        return Promise.reject(error); // Propagate the error
    }
}

// --- Main Scraping Function ---
/**
 * Scrapes an eBay results page either by navigating to `url` or by loading
 * previously saved HTML from `htmlFile` (all network requests aborted in that
 * mode). When fetching from a URL with `saveFile` set, the raw HTML is written
 * to disk and every listing image is downloaded into a directory named after
 * the save file.
 *
 * @param {object} opts
 * @param {?string} opts.url - Full eBay search URL to fetch.
 * @param {?string} opts.htmlFile - Saved HTML file to parse instead of fetching.
 * @param {?string} opts.saveFile - Path to save fetched HTML (URL mode only).
 * @returns {Promise<Array>} Extracted listing objects; [] on any error.
 */
async function scrapeEbay({ url = null, htmlFile = null, saveFile = null }) {
    logMessage("Starting scraping process...");
    let browser;
    try {
        browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] });
        const page = await browser.newPage();
        await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36');

        let htmlContentToParse;
        if (htmlFile) {
            logMessage(`Loading HTML from ${htmlFile}...`);
            htmlContentToParse = fs.readFileSync(htmlFile, 'utf8');
            // Abort every outgoing request so parsing saved HTML is fully offline.
            await page.setRequestInterception(true);
            page.on('request', (request) => { request.abort(); });
            await page.setContent(htmlContentToParse, { waitUntil: 'domcontentloaded' });
            logMessage("HTML loaded. Network requests blocked.");
        } else if (url) {
            logMessage(`Navigating to ${url}...`);
            await page.goto(url, { waitUntil: 'networkidle2', timeout: 90000 });
            logMessage("Navigation successful.");
            htmlContentToParse = await page.content();
            logMessage("Page content retrieved.");
            if (saveFile && htmlContentToParse) {
                logMessage(`Saving HTML to ${saveFile}...`);
                fs.writeFileSync(saveFile, htmlContentToParse, 'utf8');
                logMessage("HTML saved.");
            }
        } else {
            throw new Error("Internal Error: Neither URL nor HTML file was provided.");
        }

        logMessage("Injecting core parser script...");
        await page.evaluate(ebayCoreScriptContent);
        logMessage("Core script injected. Extracting data...");
        const extractedResults = await page.evaluate(() => {
            if (typeof window.EbayParser === 'undefined' || typeof window.EbayParser.extractDataFromPage !== 'function') {
                throw new Error("EbayParser not found!");
            }
            return window.EbayParser.extractDataFromPage();
        });
        logMessage(`Data extraction complete. Found ${extractedResults.length} items.`);

        // If HTML was fetched and --save was used, now download images.
        if (url && saveFile && extractedResults.length > 0) {
            const baseSaveName = path.parse(saveFile).name; // e.g., "foo2"
            // The main directory for this save operation (e.g., "foo2/").
            const mainImageSaveDirectory = path.join(path.dirname(saveFile), baseSaveName);
            logMessage(`Downloading images for ${baseSaveName} into subdirectories of ${mainImageSaveDirectory}...`);
            const downloadPromises = [];
            for (const item of extractedResults) {
                if (item.image_url) {
                    // Pass mainImageSaveDirectory as the base for the nested structure;
                    // per-image failures are logged and skipped, never fatal.
                    downloadPromises.push(
                        downloadImage(item.image_url, mainImageSaveDirectory).catch(e => {
                            logError(`Skipping image download for item ID ${item.itemId || 'unknown'} (URL: ${item.image_url}) due to error: ${e.message}`);
                        })
                    );
                }
            }
            await Promise.all(downloadPromises); // Wait for all image downloads to attempt completion
            logMessage("Image download process finished.");
        }
        return extractedResults;
    } catch (e) {
        logError(`Scraping process error: ${e.message}`);
        if (!quietMode && e.stack) console.error(e.stack);
        return [];
    } finally {
        if (browser) {
            await browser.close();
            logMessage("Browser closed.");
        }
    }
}

// --- CLI Definition ---
const program = new Command();

program
    .name('ebay-scraper')
    .description('Scrapes eBay search results.')
    .version('4.1.0') // Version bump
    // NOTE: the <file>/<number>/<cost> placeholders are required — without them
    // commander treats these as boolean flags and the string values/defaults
    // the rest of the script relies on are never populated.
    .option('--save <file>', 'Save scraped HTML to a file (and download images if fetching from URL).')
    .option('--load <file>', 'Load HTML from a file (disables network). Image download will not occur with --load.')
    .option('--only_json', 'Suppress informational logs, output only final JSON.', false)
    .on('option:only_json', () => { quietMode = true; });

program
    .command('latest')
    .description('Scrapes latest listings. Use "ebay-scraper latest --help" for options.')
    .option('--per_page <number>', 'Items per page (60, 120, or 240)', '60')
    .option('--minimum_cost <cost>', 'Minimum cost (e.g., 50.00)', '0.00')
    .action(async (cmdOptions) => {
        const globalOptions = program.opts();
        if (globalOptions.only_json) quietMode = true;
        if (globalOptions.load) {
            logMessage("Using --load for 'latest'. URL generation options ignored. Images will not be downloaded.");
            await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
        } else {
            const validPages = ['60', '120', '240'];
            if (!validPages.includes(cmdOptions.per_page)) {
                logError(`Error: --per_page must be one of ${validPages.join(', ')}.`);
                if (!quietMode) process.exit(1); else throw new Error("Invalid per_page");
            }
            const minCost = parseFloat(cmdOptions.minimum_cost);
            if (isNaN(minCost)) {
                logError("Error: --minimum_cost must be a number.");
                if (!quietMode) process.exit(1); else throw new Error("Invalid minimum_cost");
            }
            const baseUrl = 'https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10';
            const url = `${baseUrl}&_ipg=${cmdOptions.per_page}&_udlo=${minCost.toFixed(2)}`;
            logMessage(`Constructed URL for 'latest': ${url}`);
            await runScraping({ url: url, saveFile: globalOptions.save });
        }
    });

program
    .argument('[url]', 'The full eBay search URL to scrape.')
    .action(async (url) => {
        const globalOptions = program.opts();
        if (globalOptions.only_json) quietMode = true;
        if (globalOptions.load) {
            logMessage("Using --load. Provided URL argument ignored. Images will not be downloaded.");
            await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
        } else if (url) {
            await runScraping({ url: url, saveFile: globalOptions.save });
        } else {
            // If no URL, no --load, and not the 'latest' command, show help.
            // Check if 'latest' was an argument. If so, commander handles its action.
            // If not, and no URL, then show help.
            const isLatestCommand = process.argv.includes('latest');
            if (!isLatestCommand) {
                program.help();
            }
        }
    });

program.addHelpText('after', `
Example calls:
  $ ebay-scraper latest --per_page 120
  $ ebay-scraper "https://www.ebay.com/sch/i.html?_nkw=ssd"
  $ ebay-scraper --load page.html --only_json | jq .
  $ ebay-scraper --save page.html "https://www.ebay.com/sch/i.html?_nkw=hdd"`);

/**
 * Runs a scrape and prints the results as JSON. In quiet mode the JSON (or an
 * {error, data} envelope on failure) is the only stdout output.
 *
 * @param {object} options - Passed through to scrapeEbay().
 */
async function runScraping(options) {
    try {
        const data = await scrapeEbay(options);
        if (quietMode) {
            process.stdout.write(JSON.stringify(data, null, 2));
        } else {
            if (data && data.length > 0) console.log(JSON.stringify(data, null, 2));
            else logMessage("No data extracted or a critical error occurred.");
        }
    } catch (e) {
        logError(`Critical error in runScraping: ${e.message}`);
        if (!quietMode && e.stack) console.error(e.stack);
        if (quietMode) process.stdout.write(JSON.stringify({ error: e.message, data: [] }));
    }
}

(async () => {
    try {
        await program.parseAsync(process.argv);
        // If no command was specified and no URL, Commander's default help might
        // not trigger if only options are present. This ensures help is shown
        // if no actionable arguments are given.
        const args = process.argv.slice(2);
        const hasActionableArg = args.some(arg => !arg.startsWith('-') || program.commands.some(cmd => cmd.name() === arg));
        if (args.length > 0 && !hasActionableArg && !program.opts().load) {
            // Only options like --only_json but no command/url/load.
            program.help();
        } else if (args.length === 0) {
            // No arguments at all.
            program.help();
        }
    } catch (error) {
        logError(`Command parsing error: ${error.message}`);
        if (!quietMode && error.stack) console.error(error.stack);
        if (quietMode) process.stdout.write(JSON.stringify({ error: error.message, data: [] }));
        else process.exit(1);
    }
})();