// File: greasemonkey_scripts/ebay_command_line_tool.js (280 lines, 13 KiB, JavaScript)
// ebay_command_line_tool.js V4.1
// Node.js script with commands to scrape eBay and output JSON.
// Images are now saved preserving their URL path structure within the save directory.
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');
const { Command } = require('commander');
const https = require('https'); // For downloading images
const http = require('http'); // For downloading images (fallback)
const { URL } = require('url'); // For parsing image URLs
// --- Load Core Script ---
// ebay_core.js is read here and later injected into the page by scrapeEbay()
// via page.evaluate(); it is expected to define window.EbayParser.
// The whole tool is useless without it, so failure to read is fatal.
const coreScriptPath = path.join(__dirname, 'ebay_core.js');
let ebayCoreScriptContent;
try {
ebayCoreScriptContent = fs.readFileSync(coreScriptPath, 'utf8');
// readFileSync returns '' for an empty file, which would inject nothing.
if (!ebayCoreScriptContent) throw new Error("ebay_core.js is empty.");
} catch (e) {
console.error(`Critical Error: Could not read ebay_core.js: ${e.message}`);
process.exit(1);
}
// Global quiet flag: when true (set by --only_json), both helpers become no-ops
// so the only stdout output is the final JSON payload.
let quietMode = false;

// Informational logging; suppressed in quiet mode.
function logMessage(message) {
  if (quietMode) return;
  console.log(message);
}

// Error logging; also suppressed in quiet mode.
function logError(message) {
  if (quietMode) return;
  console.error(message);
}
// --- Image Downloading Function (Updated) ---
/**
 * Downloads one image, mirroring the URL's path structure under
 * baseSaveDirectory (e.g. https://host/images/g/x/s-l500.webp ->
 * <baseSaveDirectory>/images/g/x/s-l500.webp).
 *
 * Fix over V4.1: the write stream used to be opened BEFORE the request was
 * made, so a non-200 response, a request error, or a timeout left an open
 * stream and a stray zero-byte file on disk. The stream is now opened only
 * once a 200 response arrives, and any failure destroys the stream and
 * unlinks the partial file.
 *
 * @param {string} imageUrl - Absolute http(s) URL of the image; falsy values are ignored.
 * @param {string} baseSaveDirectory - Local root directory for the mirrored path.
 * @returns {Promise<void>} Resolves after the file is fully written and closed;
 *   rejects on bad URL, non-200 status, network error, write error, or 30s timeout.
 */
async function downloadImage(imageUrl, baseSaveDirectory) {
  if (!imageUrl) return;
  try {
    const parsedUrl = new URL(imageUrl);
    // Strip the leading slash so path.join treats the URL path as relative
    // to baseSaveDirectory.
    const imagePathFromUrl = parsedUrl.pathname.startsWith('/') ? parsedUrl.pathname.substring(1) : parsedUrl.pathname;
    const imageName = path.basename(imagePathFromUrl);
    const imageSubdirectory = path.dirname(imagePathFromUrl);
    const fullLocalDirectory = path.join(baseSaveDirectory, imageSubdirectory);
    const fullLocalImagePath = path.join(fullLocalDirectory, imageName);
    if (!fs.existsSync(fullLocalDirectory)) {
      fs.mkdirSync(fullLocalDirectory, { recursive: true });
      logMessage(`Created image directory: ${fullLocalDirectory}`);
    }
    const protocol = parsedUrl.protocol === 'https:' ? https : http;
    return new Promise((resolve, reject) => {
      let fileStream = null; // opened lazily on a 200 response
      // On any failure after the stream was opened: close it and remove the
      // partially-written file so no truncated images remain on disk.
      const cleanup = () => {
        if (fileStream) {
          fileStream.destroy();
          fs.unlink(fullLocalImagePath, () => {});
        }
      };
      const request = protocol.get(imageUrl, (response) => {
        if (response.statusCode !== 200) {
          logError(`Failed to download image ${imageUrl}. Status: ${response.statusCode}`);
          response.resume(); // Consume response data to free up resources
          reject(new Error(`Status code ${response.statusCode} for ${imageUrl}`));
          return;
        }
        // Only now do we know there is data worth writing.
        fileStream = fs.createWriteStream(fullLocalImagePath);
        response.pipe(fileStream);
        fileStream.on('finish', () => {
          // close() is async; resolve from its callback so the fd is released
          // before the caller proceeds.
          fileStream.close(() => {
            logMessage(`Downloaded image: ${fullLocalImagePath}`);
            resolve();
          });
        });
        fileStream.on('error', (err) => {
          logError(`Error writing image file ${fullLocalImagePath}: ${err.message}`);
          cleanup();
          reject(err);
        });
      });
      request.on('error', (err) => {
        logError(`Error downloading image ${imageUrl}: ${err.message}`);
        cleanup();
        reject(err);
      });
      // 30-second whole-request timeout; destroy() aborts the transfer.
      request.setTimeout(30000, () => {
        request.destroy();
        logError(`Timeout downloading image ${imageUrl}`);
        cleanup();
        reject(new Error(`Timeout downloading image ${imageUrl}`));
      });
    });
  } catch (error) {
    // new URL() throws on malformed input; surface it to the caller.
    logError(`Error processing image URL ${imageUrl}: ${error.message}`);
    return Promise.reject(error);
  }
}
// --- Main Scraping Function ---
// Scrapes one eBay results page and returns the items extracted by the
// injected EbayParser (from ebay_core.js). Exactly one of url/htmlFile is
// expected from the caller:
//   - url:      fetch the live page; if saveFile is also set, save the HTML
//               and then download every item's image.
//   - htmlFile: parse previously saved HTML offline; all network requests
//               are aborted, so images are never downloaded in this mode.
// Returns [] on any error (errors are logged, never thrown to the caller).
async function scrapeEbay({ url = null, htmlFile = null, saveFile = null }) {
logMessage("Starting scraping process...");
let browser;
try {
browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
// Fixed desktop Chrome UA string — presumably to look like a normal browser;
// TODO confirm this is still needed/effective against eBay.
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36');
let htmlContentToParse;
if (htmlFile) {
logMessage(`Loading HTML from ${htmlFile}...`);
htmlContentToParse = fs.readFileSync(htmlFile, 'utf8');
// Abort every sub-resource request so loading saved HTML stays fully offline.
await page.setRequestInterception(true);
page.on('request', (request) => { request.abort(); });
await page.setContent(htmlContentToParse, { waitUntil: 'domcontentloaded' });
logMessage("HTML loaded. Network requests blocked.");
} else if (url) {
logMessage(`Navigating to ${url}...`);
await page.goto(url, { waitUntil: 'networkidle2', timeout: 90000 });
logMessage("Navigation successful.");
htmlContentToParse = await page.content();
logMessage("Page content retrieved.");
// Optionally persist the fetched HTML so later runs can use --load.
if (saveFile && htmlContentToParse) {
logMessage(`Saving HTML to ${saveFile}...`);
fs.writeFileSync(saveFile, htmlContentToParse, 'utf8');
logMessage("HTML saved.");
}
} else {
throw new Error("Internal Error: Neither URL nor HTML file was provided.");
}
// Evaluate ebay_core.js inside the page; it must define window.EbayParser.
// The schema of the returned items (image_url, itemId, ...) is defined
// there, not in this file.
logMessage("Injecting core parser script...");
await page.evaluate(ebayCoreScriptContent);
logMessage("Core script injected. Extracting data...");
const extractedResults = await page.evaluate(() => {
if (typeof window.EbayParser === 'undefined' || typeof window.EbayParser.extractDataFromPage !== 'function') {
throw new Error("EbayParser not found!");
}
return window.EbayParser.extractDataFromPage();
});
logMessage(`Data extraction complete. Found ${extractedResults.length} items.`);
// If HTML was fetched and --save was used, now download images
// into "<dir of saveFile>/<saveFile basename>/<image URL path>".
if (url && saveFile && extractedResults.length > 0) {
const baseSaveName = path.parse(saveFile).name; // e.g., "foo2"
// The main directory for this save operation (e.g., "foo2/")
const mainImageSaveDirectory = path.join(path.dirname(saveFile), baseSaveName);
logMessage(`Downloading images for ${baseSaveName} into subdirectories of ${mainImageSaveDirectory}...`);
const downloadPromises = [];
for (const item of extractedResults) {
if (item.image_url) {
// Pass the mainImageSaveDirectory as the base for creating nested structure
downloadPromises.push(
// Per-image failures are logged and swallowed so one bad image
// does not abort the whole batch.
downloadImage(item.image_url, mainImageSaveDirectory).catch(e => {
logError(`Skipping image download for item ID ${item.itemId || 'unknown'} (URL: ${item.image_url}) due to error: ${e.message}`);
})
);
}
}
await Promise.all(downloadPromises); // Wait for all image downloads to attempt completion
logMessage("Image download process finished.");
}
return extractedResults;
} catch (e) {
logError(`Scraping process error: ${e.message}`);
if (!quietMode && e.stack) console.error(e.stack);
return []; // callers (runScraping) rely on always getting an array
} finally {
if (browser) {
await browser.close();
logMessage("Browser closed.");
}
}
}
// --- CLI definition: global options shared by every command ---
const program = new Command();
program.name('ebay-scraper');
program.description('Scrapes eBay search results.');
program.version('4.1.0'); // keep in sync with the header comment (V4.1)
program.option('--save <filename>', 'Save scraped HTML to a file (and download images if fetching from URL).');
program.option('--load <filename>', 'Load HTML from a file (disables network). Image download will not occur with --load.');
program.option('--only_json', 'Suppress informational logs, output only final JSON.', false);
// Flip quiet mode the moment the flag is parsed, so even early logs are suppressed.
program.on('option:only_json', () => { quietMode = true; });
// --- "latest" subcommand: newest listings from a fixed search URL ---
program
  .command('latest')
  .description('Scrapes latest listings. Use "ebay-scraper latest --help" for options.')
  .option('--per_page <number>', 'Items per page (60, 120, or 240)', '60')
  .option('--minimum_cost <number>', 'Minimum cost (e.g., 50.00)', '0.00')
  .action(async (cmdOptions) => {
    const globalOptions = program.opts();
    if (globalOptions.only_json) quietMode = true;

    // --load bypasses URL construction entirely (offline mode, no images).
    if (globalOptions.load) {
      logMessage("Using --load for 'latest'. URL generation options ignored. Images will not be downloaded.");
      await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
      return;
    }

    // Validate --per_page against the page sizes eBay accepts for _ipg.
    const allowedPageSizes = ['60', '120', '240'];
    if (!allowedPageSizes.includes(cmdOptions.per_page)) {
      logError(`Error: --per_page must be one of ${allowedPageSizes.join(', ')}.`);
      if (!quietMode) process.exit(1); else throw new Error("Invalid per_page");
    }

    // Validate --minimum_cost (becomes the _udlo lower-price filter).
    const minCost = parseFloat(cmdOptions.minimum_cost);
    if (Number.isNaN(minCost)) {
      logError("Error: --minimum_cost must be a number.");
      if (!quietMode) process.exit(1); else throw new Error("Invalid minimum_cost");
    }

    // Fixed search for category 175669 — presumably sorted newest-first via
    // _sop=10; verify parameter meanings against eBay's URL scheme if changed.
    const baseUrl = 'https://www.ebay.com/sch/i.html?_nkw=&_sacat=175669&_from=R40&_fsrp=1&LH_PrefLoc=3&imm=1&_sop=10';
    const url = `${baseUrl}&_ipg=${cmdOptions.per_page}&_udlo=${minCost.toFixed(2)}`;
    logMessage(`Constructed URL for 'latest': ${url}`);
    await runScraping({ url: url, saveFile: globalOptions.save });
  });
// --- Default action: scrape an arbitrary eBay search URL ---
program
  .argument('[url]', 'The full eBay search URL to scrape.')
  .action(async (url) => {
    const globalOptions = program.opts();
    if (globalOptions.only_json) quietMode = true;

    // --load wins over any URL argument (offline mode, no images).
    if (globalOptions.load) {
      logMessage("Using --load. Provided URL argument ignored. Images will not be downloaded.");
      await runScraping({ htmlFile: globalOptions.load, saveFile: globalOptions.save });
      return;
    }
    if (url) {
      await runScraping({ url: url, saveFile: globalOptions.save });
      return;
    }
    // No URL and no --load: show help, unless the 'latest' subcommand was
    // invoked — commander routes that to its own action handler.
    if (!process.argv.includes('latest')) {
      program.help();
    }
  });
// Extra usage examples appended to the --help output.
program.addHelpText('after', `
Example calls:
$ ebay-scraper latest --per_page 120
$ ebay-scraper "https://www.ebay.com/sch/i.html?_nkw=ssd"
$ ebay-scraper --load page.html --only_json | jq .
$ ebay-scraper --save page.html "https://www.ebay.com/sch/i.html?_nkw=hdd"`);
/**
 * Runs one scrape and emits the result as pretty-printed JSON.
 * Quiet mode writes the JSON (or an {error, data} envelope on failure)
 * straight to stdout for piping; otherwise output goes through the loggers.
 * @param {{url?: string, htmlFile?: string, saveFile?: string}} options - forwarded to scrapeEbay.
 */
async function runScraping(options) {
  try {
    const data = await scrapeEbay(options);
    const json = JSON.stringify(data, null, 2);
    if (quietMode) {
      process.stdout.write(json);
      return;
    }
    if (data && data.length > 0) {
      console.log(json);
    } else {
      logMessage("No data extracted or a critical error occurred.");
    }
  } catch (e) {
    logError(`Critical error in runScraping: ${e.message}`);
    if (!quietMode && e.stack) console.error(e.stack);
    if (quietMode) process.stdout.write(JSON.stringify({error: e.message, data: []}));
  }
}
// --- Entry point ---
(async () => {
try {
await program.parseAsync(process.argv);
// If no command was specified and no URL, Commander's default help might not trigger if only options are present.
// This ensures help is shown if no actionable arguments are given.
const args = process.argv.slice(2);
// "Actionable" = any non-option token, or a known subcommand name.
// NOTE(review): the subcommand clause is redundant — a subcommand name never
// starts with '-', so the first test already matches it. Likewise a
// "--load file.html" invocation passes because the filename is a non-option
// token, making the explicit !program.opts().load check belt-and-braces.
const hasActionableArg = args.some(arg => !arg.startsWith('-') || program.commands.some(cmd => cmd.name() === arg));
if (args.length > 0 && !hasActionableArg && !program.opts().load) { // If only options like --only_json but no command/url/load
program.help();
} else if (args.length === 0) { // No arguments at all
program.help();
}
} catch (error) {
logError(`Command parsing error: ${error.message}`);
if (!quietMode && error.stack) console.error(error.stack);
// Quiet mode emits a machine-readable error envelope on stdout instead of
// a non-zero exit, so downstream JSON consumers still get valid output.
if (quietMode) process.stdout.write(JSON.stringify({error: error.message, data: []}));
else process.exit(1);
}
})();