// ebay_command_line_tool.js V4.1
// Node.js script with commands to scrape eBay and output JSON.
// Downloaded images keep their URL path structure inside the save directory.

const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
const { Command } = require('commander');
const https = require('https'); // image downloads (TLS)
const http = require('http');   // image downloads (plain-HTTP fallback)
const { URL } = require('url'); // image URL parsing
// --- Load Core Script ---
// ebay_core.js is injected into the scraped page later; read it eagerly so a
// missing or empty parser aborts the tool before any browser work starts.
const coreScriptPath = path.join(__dirname, 'ebay_core.js');
let ebayCoreScriptContent;
try {
  const contents = fs.readFileSync(coreScriptPath, 'utf8');
  if (!contents) throw new Error("ebay_core.js is empty.");
  ebayCoreScriptContent = contents;
} catch (e) {
  console.error(`Critical Error: Could not read ebay_core.js: ${e.message}`);
  process.exit(1);
}
// Global quiet flag: when true (--only_json) all informational output is
// suppressed so stdout carries nothing but the final JSON.
let quietMode = false;

// Informational log — silenced in quiet mode.
function logMessage(message) {
  if (quietMode) return;
  console.log(message);
}

// Error log — also silenced in quiet mode (errors surface via JSON instead).
function logError(message) {
  if (quietMode) return;
  console.error(message);
}
// --- Image Downloading Function ---
/**
 * Downloads one image, mirroring the URL's path structure under
 * baseSaveDirectory (e.g. /images/g/abc/s-l500.webp ->
 * <base>/images/g/abc/s-l500.webp).
 *
 * Fixes over the previous version: the write stream is only opened once a
 * 200 response arrives, so failed requests (bad status, network error,
 * timeout) no longer leak a file descriptor or leave a stray empty file,
 * and the promise resolves only after close() has actually released the fd.
 *
 * @param {?string} imageUrl          Absolute image URL; falsy -> no-op.
 * @param {string}  baseSaveDirectory Local root to mirror the URL path into.
 * @returns {Promise<void>} resolves when written; rejects on any failure.
 */
async function downloadImage(imageUrl, baseSaveDirectory) {
  if (!imageUrl) return;
  try {
    const parsedUrl = new URL(imageUrl);

    // Strip the leading slash so path.join treats it as relative to the base.
    const imagePathFromUrl = parsedUrl.pathname.startsWith('/') ? parsedUrl.pathname.substring(1) : parsedUrl.pathname;

    // Split the URL path into subdirectory and filename parts.
    const imageName = path.basename(imagePathFromUrl);
    const imageSubdirectory = path.dirname(imagePathFromUrl);

    const fullLocalDirectory = path.join(baseSaveDirectory, imageSubdirectory);
    const fullLocalImagePath = path.join(fullLocalDirectory, imageName);

    if (!fs.existsSync(fullLocalDirectory)) {
      fs.mkdirSync(fullLocalDirectory, { recursive: true });
      logMessage(`Created image directory: ${fullLocalDirectory}`);
    }

    const protocol = parsedUrl.protocol === 'https:' ? https : http;

    return await new Promise((resolve, reject) => {
      const request = protocol.get(imageUrl, (response) => {
        if (response.statusCode !== 200) {
          logError(`Failed to download image ${imageUrl}. Status: ${response.statusCode}`);
          response.resume(); // drain so the socket can be reused/freed
          reject(new Error(`Status code ${response.statusCode} for ${imageUrl}`));
          return;
        }

        // Open the file only after a successful response — a failed request
        // must not create (or truncate) anything on disk.
        const fileStream = fs.createWriteStream(fullLocalImagePath);
        response.pipe(fileStream);

        fileStream.on('finish', () => {
          // Resolve from the close() callback so the fd is released first.
          fileStream.close(() => {
            logMessage(`Downloaded image: ${fullLocalImagePath}`);
            resolve();
          });
        });

        fileStream.on('error', (err) => {
          logError(`Error writing image file ${fullLocalImagePath}: ${err.message}`);
          fs.unlink(fullLocalImagePath, () => {}); // best-effort cleanup of partial file
          reject(err);
        });
      });

      request.on('error', (err) => {
        // Request-level failure: no file was opened, nothing to clean up.
        logError(`Error downloading image ${imageUrl}: ${err.message}`);
        reject(err);
      });

      // Abort stalled downloads after 30 seconds.
      request.setTimeout(30000, () => {
        request.destroy();
        logError(`Timeout downloading image ${imageUrl}`);
        reject(new Error(`Timeout downloading image ${imageUrl}`));
      });
    });
  } catch (error) {
    // Bad URL or filesystem error before the request started.
    logError(`Error processing image URL ${imageUrl}: ${error.message}`);
    throw error; // propagate as a rejected promise
  }
}
// --- Main Scraping Function ---
/**
 * Launches headless Chromium, loads an eBay results page (live URL or a
 * previously saved HTML file), injects ebay_core.js and returns the items
 * extracted by window.EbayParser.extractDataFromPage().
 *
 * When fetching from a URL with saveFile set, the raw HTML is written to
 * disk and every item image is downloaded into a directory named after the
 * save file. In htmlFile mode all network requests are aborted.
 *
 * @param {object}  opts
 * @param {?string} opts.url      Live eBay search URL to fetch.
 * @param {?string} opts.htmlFile Saved HTML to parse offline (takes precedence).
 * @param {?string} opts.saveFile Destination for fetched HTML (URL mode only).
 * @returns {Promise<Array>} extracted items; [] on any error.
 */
async function scrapeEbay({ url = null, htmlFile = null, saveFile = null }) {
  logMessage("Starting scraping process...");
  let browser;
  try {
    browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] });
    const tab = await browser.newPage();
    await tab.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36');

    let pageHtml;
    if (htmlFile) {
      // Offline mode: load the saved file and block all outbound requests.
      logMessage(`Loading HTML from ${htmlFile}...`);
      pageHtml = fs.readFileSync(htmlFile, 'utf8');
      await tab.setRequestInterception(true);
      tab.on('request', (req) => { req.abort(); });
      await tab.setContent(pageHtml, { waitUntil: 'domcontentloaded' });
      logMessage("HTML loaded. Network requests blocked.");
    } else if (url) {
      logMessage(`Navigating to ${url}...`);
      await tab.goto(url, { waitUntil: 'networkidle2', timeout: 90000 });
      logMessage("Navigation successful.");
      pageHtml = await tab.content();
      logMessage("Page content retrieved.");
      if (saveFile && pageHtml) {
        logMessage(`Saving HTML to ${saveFile}...`);
        fs.writeFileSync(saveFile, pageHtml, 'utf8');
        logMessage("HTML saved.");
      }
    } else {
      throw new Error("Internal Error: Neither URL nor HTML file was provided.");
    }

    logMessage("Injecting core parser script...");
    await tab.evaluate(ebayCoreScriptContent);
    logMessage("Core script injected. Extracting data...");
    const listings = await tab.evaluate(() => {
      if (typeof window.EbayParser === 'undefined' || typeof window.EbayParser.extractDataFromPage !== 'function') {
        throw new Error("EbayParser not found!");
      }
      return window.EbayParser.extractDataFromPage();
    });
    logMessage(`Data extraction complete. Found ${listings.length} items.`);

    // Images are downloaded only for live fetches that were also saved.
    if (url && saveFile && listings.length > 0) {
      const baseSaveName = path.parse(saveFile).name; // e.g. "foo2"
      // Images go under a directory named after the save file (e.g. "foo2/").
      const mainImageSaveDirectory = path.join(path.dirname(saveFile), baseSaveName);
      logMessage(`Downloading images for ${baseSaveName} into subdirectories of ${mainImageSaveDirectory}...`);

      // Per-image failures are logged and swallowed so one bad image
      // cannot abort the whole batch.
      const downloads = listings
        .filter((item) => item.image_url)
        .map((item) =>
          downloadImage(item.image_url, mainImageSaveDirectory).catch((e) => {
            logError(`Skipping image download for item ID ${item.itemId || 'unknown'} (URL: ${item.image_url}) due to error: ${e.message}`);
          })
        );
      await Promise.all(downloads);
      logMessage("Image download process finished.");
    }
    return listings;
  } catch (e) {
    logError(`Scraping process error: ${e.message}`);
    if (!quietMode && e.stack) console.error(e.stack);
    return [];
  } finally {
    if (browser) {
      await browser.close();
      logMessage("Browser closed.");
    }
  }
}
// --- CLI definition (global options shared by all commands) ---
const program = new Command();
program.name('ebay-scraper');
program.description('Scrapes eBay search results.');
program.version('4.1.0');
program.option('--save <filename>', 'Save scraped HTML to a file (and download images if fetching from URL).');
program.option('--load <filename>', 'Load HTML from a file (disables network). Image download will not occur with --load.');
program.option('--only_json', 'Suppress informational logs, output only final JSON.', false);
// Flip quiet mode as soon as commander sees the flag, before any action runs.
program.on('option:only_json', () => { quietMode = true; });
// 'latest' subcommand: builds a fixed eBay search URL sorted by newest.
program
  .command('latest')
  .description('Scrapes latest listings. Use "ebay-scraper latest --help" for options.')
  .option('--per_page <number>', 'Items per page (60, 120, or 240)', '60')
  .option('--minimum_cost <number>', 'Minimum cost (e.g., 50.00)', '0.00')
  .action(async (cmdOptions) => {
    const globalOptions = program.opts();
    if (globalOptions.only_json) quietMode = true;

    // --load wins over URL construction: parse the saved file offline.
    if (globalOptions.load) {
      logMessage("Using --load for 'latest'. URL generation options ignored. Images will not be downloaded.");
      await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
      return;
    }

    // Validate the page-size option (eBay only accepts these three values).
    const validPages = ['60', '120', '240'];
    if (!validPages.includes(cmdOptions.per_page)) {
      logError(`Error: --per_page must be one of ${validPages.join(', ')}.`);
      if (!quietMode) process.exit(1); else throw new Error("Invalid per_page");
    }

    const minCost = parseFloat(cmdOptions.minimum_cost);
    if (Number.isNaN(minCost)) {
      logError("Error: --minimum_cost must be a number.");
      if (!quietMode) process.exit(1); else throw new Error("Invalid minimum_cost");
    }

    // Fixed category/sort parameters; page size and minimum price appended.
    const baseUrl = 'https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10';
    const searchUrl = `${baseUrl}&_ipg=${cmdOptions.per_page}&_udlo=${minCost.toFixed(2)}`;
    logMessage(`Constructed URL for 'latest': ${searchUrl}`);
    await runScraping({ url: searchUrl, saveFile: globalOptions.save });
  });
// Default invocation: scrape an explicit eBay search URL.
program
  .argument('[url]', 'The full eBay search URL to scrape.')
  .action(async (url) => {
    const globalOptions = program.opts();
    if (globalOptions.only_json) quietMode = true;

    if (globalOptions.load) {
      // --load overrides any URL argument and disables image downloads.
      logMessage("Using --load. Provided URL argument ignored. Images will not be downloaded.");
      await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
      return;
    }
    if (url) {
      await runScraping({ url: url, saveFile: globalOptions.save });
      return;
    }
    // No URL and no --load: if 'latest' was invoked, commander dispatches
    // its own action; otherwise there is nothing to do, so show usage.
    const isLatestCommand = process.argv.includes('latest');
    if (!isLatestCommand) {
      program.help();
    }
  });
// Usage examples appended after the auto-generated --help output.
program.addHelpText('after', `
Example calls:
$ ebay-scraper latest --per_page 120
$ ebay-scraper "https://www.ebay.com/sch/i.html?_nkw=ssd"
$ ebay-scraper --load page.html --only_json | jq .
$ ebay-scraper --save page.html "https://www.ebay.com/sch/i.html?_nkw=hdd"`);
/**
 * Runs scrapeEbay() and prints the outcome.
 * Quiet mode always writes the JSON result (even when empty) to stdout;
 * otherwise JSON is printed only when items were found.
 *
 * @param {object} options Forwarded to scrapeEbay ({url, htmlFile, saveFile}).
 */
async function runScraping(options) {
  try {
    const data = await scrapeEbay(options);
    if (quietMode) {
      process.stdout.write(JSON.stringify(data, null, 2));
      return;
    }
    if (data && data.length > 0) {
      console.log(JSON.stringify(data, null, 2));
    } else {
      logMessage("No data extracted or a critical error occurred.");
    }
  } catch (e) {
    logError(`Critical error in runScraping: ${e.message}`);
    if (!quietMode && e.stack) console.error(e.stack);
    // In quiet mode consumers still get machine-readable output on failure.
    if (quietMode) process.stdout.write(JSON.stringify({error: e.message, data: []}));
  }
}
// --- Entry point ---
(async () => {
  try {
    await program.parseAsync(process.argv);

    // Commander will not show help when the invocation contains only bare
    // options (e.g. just --only_json); detect that case here. An argument is
    // "actionable" if it is a non-option token or a known subcommand name.
    const cliArgs = process.argv.slice(2);
    const sawActionable = cliArgs.some(
      (arg) => !arg.startsWith('-') || program.commands.some((cmd) => cmd.name() === arg)
    );
    if (cliArgs.length === 0) {
      program.help(); // invoked with no arguments at all
    } else if (!sawActionable && !program.opts().load) {
      program.help(); // options only: no command, URL, or --load to act on
    }
  } catch (error) {
    logError(`Command parsing error: ${error.message}`);
    if (!quietMode && error.stack) console.error(error.stack);
    if (quietMode) process.stdout.write(JSON.stringify({error: error.message, data: []}));
    else process.exit(1);
  }
})();