const { promisify: promisify } = require("util");
const { exec: exec, execFile: execFile } = require("child_process");
const { readFileSync: readFileSync } = require("node:fs");
const fs = require("fs/promises");
const { fileTypeFromBuffer: fileTypeFromBuffer } = require("file-type");
const { imageSize: imageSize } = require("image-size");
const pdfParse = require("pdf-parse");
const mammoth = require("mammoth");
const XLSX = require("xlsx");
const officeparser = require("officeparser");
const execPromise = promisify(exec);
// Shell-free variant: arguments are passed as an array, never parsed by a shell.
const execFilePromise = promisify(execFile);
/**
* @module metadata-helper
* @description Provides functions to get metadata for various file types.
*/
/**
* @typedef {object} imageData
* @property {number} width - The width of the image in pixels.
* @property {number} height - The height of the image in pixels.
* @property {string} orientation - The image orientation ('portrait' or 'landscape').
* @property {string} type - The file extension of the image (e.g., 'png', 'jpeg').
*/
/**
* @typedef {object} mediaData
* @property {number|null} duration - The duration of the media in seconds, or null if unavailable.
* @property {string} container - The format name of the media container.
* @property {object} [video] - Video stream metadata.
* @property {number} [video.width] - The width of the video.
* @property {number} [video.height] - The height of the video.
* @property {string} [video.codec] - The video codec name.
* @property {object} [audio] - Audio stream metadata.
* @property {string} [audio.codec] - The audio codec name.
* @property {number|null} [audio.bit_rate] - The audio bitrate in bits per second, or null if unavailable.
*/
/**
* @typedef {object} pdfData
* @property {number} pages - The number of pages in the PDF.
* @property {object} info - The PDF document's info dictionary (e.g., title, author).
*/
/**
* @typedef {object} excelData
* @property {number} sheets - The number of sheets in the Excel workbook.
* @property {string[]} sheetNames - An array of the names of the worksheets.
*/
/**
* @typedef {object} powerPointData
* @property {number|null} slidesCount - The number of slides in the presentation, or null if unable to parse.
*/
/**
* @typedef {object} wordData
* @property {number} wordCount - The number of words in the document.
* @property {number} charCount - The number of characters in the document.
*/
/**
* @typedef {object} textData
* @property {number} lines - The number of lines in the text file.
* @property {number} charCount - The number of characters in the text file.
* @property {string} firstLine - The first 100 characters of the first line of the file.
*/
/**
* @typedef {object} binaryData
* @property {string} platformInfo - A descriptive string about the file from the 'file' command.
*/
/**
 * Extracts basic image properties with the `image-size` library.
 *
 * The file is read asynchronously so that large images do not block the
 * event loop, then the bytes are handed to `imageSize`.
 *
 * @param {string} filepath - The path to the image file.
 * @returns {Promise<imageData>} A promise that resolves to the image metadata.
 *   `orientation` is 'landscape' only when the width is strictly greater than
 *   the height; every other case (including square images) is 'portrait'.
 *   The promise rejects if the file cannot be read or `imageSize` throws.
 */
const handleImage = async (filepath) => {
  const bytes = await fs.readFile(filepath);
  const { width, height, type } = imageSize(bytes);
  return {
    width,
    height,
    orientation: width > height ? "landscape" : "portrait",
    type,
  };
};
/**
 * Extracts metadata for video and audio files by running `ffprobe`.
 *
 * `ffprobe` is invoked through `execFilePromise` with an argument array, so
 * the file path is never interpreted by a shell (paths containing quotes,
 * `$(...)`, backticks, etc. are passed through verbatim). The path is given
 * via `-i` so a name starting with `-` is not mistaken for an option.
 *
 * @param {string} filepath - The path to the media file.
 * @returns {Promise<mediaData>} A promise that resolves to the media metadata.
 *   `duration` and `audio.bit_rate` are `null` when ffprobe does not report a
 *   numeric value (missing field or a placeholder such as "N/A"). `video` and
 *   `audio` are only present when a stream of that type exists; the first
 *   stream of each type is used.
 *   The promise rejects if `ffprobe` cannot be run, exits with an error, or
 *   prints output that is not valid JSON.
 */
const handleMedia = async (filepath) => {
  const { stdout } = await execFilePromise("ffprobe", [
    "-v",
    "error",
    "-show_format",
    "-show_streams",
    "-of",
    "json",
    "-i",
    filepath,
  ]);
  const { format, streams = [] } = JSON.parse(stdout);
  const videoStream = streams.find((stream) => stream.codec_type === "video");
  const audioStream = streams.find((stream) => stream.codec_type === "audio");

  // ffprobe reports numbers as strings; anything non-numeric becomes null.
  const duration = Number.parseFloat(format.duration);
  const result = {
    duration: Number.isFinite(duration) ? duration : null,
    container: format.format_name,
  };

  if (videoStream) {
    result.video = {
      width: videoStream.width,
      height: videoStream.height,
      codec: videoStream.codec_name,
    };
  }
  if (audioStream) {
    const bitRate = Number.parseInt(audioStream.bit_rate, 10);
    result.audio = {
      codec: audioStream.codec_name,
      bit_rate: Number.isFinite(bitRate) ? bitRate : null,
    };
  }
  return result;
};
/**
 * Parses a PDF file to extract its page count and document info.
 *
 * The file is read asynchronously (rather than with a blocking sync read)
 * and the resulting buffer is passed to `pdf-parse`.
 *
 * @param {string} filepath - The path to the PDF file.
 * @returns {Promise<pdfData>} A promise that resolves to the PDF metadata:
 *   `pages` is taken from the parser's `numpages` and `info` from its `info`.
 *   The promise rejects if the file cannot be read or parsing fails.
 */
const handlePdf = async (filepath) => {
  const buffer = await fs.readFile(filepath);
  const { numpages, info } = await pdfParse(buffer);
  return { pages: numpages, info };
};
/**
 * Reads an Excel workbook to get the number of sheets and their names.
 *
 * The file is read asynchronously and parsed with SheetJS's `bookSheets`
 * option, which (per the SheetJS parsing-options documentation) parses only
 * enough of the workbook to obtain the sheet names instead of materialising
 * every worksheet's cells.
 *
 * @param {string} filepath - The path to the Excel file.
 * @returns {Promise<excelData>} A promise that resolves to the Excel metadata.
 *   The promise rejects if the file cannot be read or is not a parsable workbook.
 */
const handleExcel = async (filepath) => {
  const buffer = await fs.readFile(filepath);
  const { SheetNames } = XLSX.read(buffer, { type: "buffer", bookSheets: true });
  return { sheets: SheetNames.length, sheetNames: SheetNames };
};
/**
 * Counts the slides of a PowerPoint presentation via `officeparser`.
 *
 * NOTE(review): this relies on `officeparser.parse` resolving to an object
 * that exposes a `slides` collection — confirm against the installed
 * officeparser version.
 *
 * @param {string} filepath - The path to the PowerPoint file.
 * @returns {Promise<powerPointData>} A promise that resolves to the PowerPoint
 *   metadata; `slidesCount` is `null` when no `slides` collection is returned
 *   or it is empty.
 */
const handlePowerPoint = async (filepath) => {
  const parsed = await officeparser.parse(filepath);
  const slideList = parsed.slides;
  if (!slideList || !(slideList.length > 0)) {
    return { slidesCount: null };
  }
  return { slidesCount: slideList.length };
};
/**
 * Computes word and character counts for a Word document.
 *
 * The document's raw text is obtained from `mammoth.extractRawText`; a word
 * is any maximal run of non-whitespace characters, and the character count
 * is the length of the extracted text (whitespace included).
 *
 * @param {string} filepath - The path to the Word file.
 * @returns {Promise<wordData>} A promise that resolves to the Word document metadata.
 */
const handleWord = async (filepath) => {
  const extraction = await mammoth.extractRawText({ path: filepath });
  const text = extraction.value;
  // `match` yields null when there are no words at all.
  const tokens = text.match(/\S+/g) || [];
  return { wordCount: tokens.length, charCount: text.length };
};
/**
 * Reads a text file as UTF-8 and extracts basic statistics.
 *
 * Lines may be terminated by LF or CRLF. A single trailing line terminator
 * ends the last line rather than starting a new, empty one, so "a\nb\n" has
 * two lines and an empty file has zero.
 *
 * @param {string} filepath - The path to the text file.
 * @returns {Promise<textData>} A promise that resolves to the text file metadata.
 *   `charCount` is the length of the full decoded content (line terminators
 *   included); `firstLine` holds at most the first 100 characters of the first
 *   line, without its line terminator, or an empty string for an empty file.
 */
const handleText = async (filepath) => {
  const content = await fs.readFile(filepath, "utf-8");
  // Drop one final terminator so it is not counted as an extra empty line.
  const lines = content === "" ? [] : content.replace(/\r?\n$/, "").split(/\r?\n/);
  return {
    lines: lines.length,
    charCount: content.length,
    firstLine: lines.length > 0 ? lines[0].substring(0, 100) : "",
  };
};
/**
 * A fallback handler that uses the system's `file` command to get a generic
 * description of a file.
 *
 * `file` is run through `execFilePromise` with an argument array, so the path
 * is never interpreted by a shell. Brief mode (`-b`) makes `file` print only
 * the description, which avoids having to split the filename off the output
 * (that broke for paths or descriptions containing ": "). `--` ends option
 * parsing so a path starting with `-` is treated as a file name.
 *
 * This handler is best-effort: it never rejects.
 *
 * @param {string} filepath - The path to the binary file.
 * @returns {Promise<binaryData>} A promise that resolves to the file's platform
 *   info, or `{ platformInfo: "unknown" }` if the command fails or prints nothing.
 */
const handleBinary = async (filepath) => {
  try {
    const { stdout } = await execFilePromise("file", ["-b", "--", filepath]);
    return { platformInfo: stdout.trim() || "unknown" };
  } catch (error) {
    console.warn(
      `'file' command failed to run. Check if it's installed and in your PATH.`,
    );
    return { platformInfo: "unknown" };
  }
};
/**
* Lookup table from MIME type keys to the metadata handler for that type.
*
* A key is either a type-family prefix ending in "/" (e.g. "image/") or a
* complete MIME type (e.g. "application/pdf"). `getMetadata` picks the first
* key, in insertion order, that equals the file's MIME type or is a prefix of it,
* so a more specific key must be listed before any broader key it overlaps with.
* @type {Object<string, Function>}
*/
const handlers = {
// Type families: any subtype is routed to the same handler.
"image/": handleImage,
"video/": handleMedia,
"audio/": handleMedia,
"application/pdf": handlePdf,
// Excel: OOXML spreadsheet and the legacy "vnd.ms-excel" type share one handler.
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": handleExcel,
"application/vnd.ms-excel": handleExcel,
"application/vnd.openxmlformats-officedocument.presentationml.presentation": handlePowerPoint,
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": handleWord,
"text/": handleText,
};
/**
 * Fetches metadata for a given file.
 *
 * When no MIME type is supplied (or it is the generic
 * "application/octet-stream"), the file is read and its type is sniffed with
 * `fileTypeFromBuffer`. The MIME type is then matched against the keys of
 * `handlers` (a key matches when the type starts with it); the comparison is
 * case-insensitive because MIME type names are case-insensitive. If no key
 * matches, `handleBinary` is used as the fallback.
 *
 * Extraction is best-effort: if detection or the handler throws, a warning is
 * logged and whatever metadata was gathered so far is returned. This function
 * does not reject for such failures.
 *
 * @param {string} filepath - The path to the file.
 * @param {string} [mimeType] - Optional MIME type to bypass file type detection.
 * @returns {Promise<object>} A promise that resolves to an object containing the
 *   file's metadata: always `mimeType` (as supplied or as detected), plus the
 *   fields produced by the selected handler when it succeeds.
 */
const getMetadata = async (filepath, mimeType) => {
  let metadata = { mimeType: mimeType || "application/octet-stream" };
  try {
    if (metadata.mimeType === "application/octet-stream") {
      const buffer = await fs.readFile(filepath);
      const fileType = await fileTypeFromBuffer(buffer);
      if (fileType) {
        metadata.mimeType = fileType.mime;
      }
    }

    // Only the lookup is normalised; the reported mimeType keeps its original form.
    const lookupType = metadata.mimeType.toLowerCase();
    const handlerKey = Object.keys(handlers).find((key) => lookupType.startsWith(key));
    const handler = handlers[handlerKey] || handleBinary;

    metadata = { ...metadata, ...(await handler(filepath)) };
  } catch (error) {
    console.warn(`Error extracting metadata for file '${filepath}': ${error.message}`);
  }
  return metadata;
};
// The module's sole export is the getMetadata function; the per-type handlers stay private.
module.exports = getMetadata;