// src/metadata-helper.js - Documentation

const { promisify: promisify } = require("util");
const { exec: exec, execFile: execFile } = require("child_process");
const { fileTypeFromBuffer: fileTypeFromBuffer } = require("file-type");
const { imageSize: imageSize } = require("image-size");
const { readFileSync: readFileSync } = require("node:fs");
const fs = require("fs/promises");
const pdfParse = require("pdf-parse");
const mammoth = require("mammoth");
const XLSX = require("xlsx");
const officeparser = require("officeparser");
const execPromise = promisify(exec);

/**
 * @module metadata-helper
 * @description Provides functions to get metadata for various file types.
 */

/**
 * @typedef {object} imageData
 * @property {number} width - The width of the image in pixels.
 * @property {number} height - The height of the image in pixels.
 * @property {string} orientation - The image orientation ('portrait' or 'landscape').
 * @property {string} type - The file extension of the image (e.g., 'png', 'jpeg').
 */

/**
 * @typedef {object} mediaData
 * @property {number|null} duration - The duration of the media in seconds, or null if unavailable.
 * @property {string} container - The format name of the media container.
 * @property {object} [video] - Video stream metadata.
 * @property {number} [video.width] - The width of the video.
 * @property {number} [video.height] - The height of the video.
 * @property {string} [video.codec] - The video codec name.
 * @property {object} [audio] - Audio stream metadata.
 * @property {string} [audio.codec] - The audio codec name.
 * @property {number|null} [audio.bit_rate] - The audio bitrate in bits per second, or null if unavailable.
 */

/**
 * @typedef {object} pdfData
 * @property {number} pages - The number of pages in the PDF.
 * @property {object} info - The PDF document's info dictionary (e.g., title, author).
 */

/**
 * @typedef {object} excelData
 * @property {number} sheets - The number of sheets in the Excel workbook.
 * @property {string[]} sheetNames - An array of the names of the worksheets.
 */

/**
 * @typedef {object} powerPointData
 * @property {number|null} slidesCount - The number of slides in the presentation, or null if unable to parse.
 */

/**
 * @typedef {object} wordData
 * @property {number} wordCount - The number of words in the document.
 * @property {number} charCount - The number of characters in the document.
 */

/**
 * @typedef {object} textData
 * @property {number} lines - The number of lines in the text file.
 * @property {number} charCount - The number of characters in the text file.
 * @property {string} firstLine - The first 100 characters of the first line of the file.
 */

/**
 * @typedef {object} binaryData
 * @property {string} platformInfo - A descriptive string about the file from the 'file' command.
 */

/**
 * Reads an image file and reports its pixel dimensions, orientation and format
 * using the `image-size` library.
 *
 * The file is loaded through the promise-based `fs` API so the event loop is
 * not blocked while the image is read from disk.
 * @param {string} filepath - The path to the image file.
 * @returns {Promise<imageData>} A promise that resolves to the image metadata.
 *   Any error raised while reading or measuring the file surfaces as a rejection.
 */
const handleImage = async (filepath) => {
  const buffer = await fs.readFile(filepath);
  const { width, height, type } = imageSize(buffer);
  return {
    width,
    height,
    // Only images strictly wider than tall are "landscape"; squares count as "portrait".
    orientation: width > height ? "landscape" : "portrait",
    type,
  };
};

/**
 * Uses `ffprobe` to extract metadata for video and audio files.
 *
 * `ffprobe` is started with `execFile` and an argument array, so the path is
 * handed to the process verbatim and never parsed by a shell. File names that
 * contain quotes, backticks, `$(...)` or `;` therefore cannot inject commands.
 * @param {string} filepath - The path to the media file.
 * @returns {Promise<mediaData>} A promise that resolves to the media metadata.
 *   Rejects if `ffprobe` cannot be started, exits with an error, or prints
 *   output that is not valid JSON.
 */
const handleMedia = async (filepath) => {
  const args = ["-v", "error", "-show_format", "-show_streams", "-of", "json", filepath];
  const { stdout } = await promisify(execFile)("ffprobe", args);
  const { format, streams } = JSON.parse(stdout);

  // Only the first stream of each kind is reported.
  const videoStream = streams.find((stream) => stream.codec_type === "video");
  const audioStream = streams.find((stream) => stream.codec_type === "audio");

  const result = {
    // ffprobe reports numeric fields as strings; convert them, or use null when absent.
    duration: format.duration ? Number.parseFloat(format.duration) : null,
    container: format.format_name,
  };
  if (videoStream) {
    result.video = {
      width: videoStream.width,
      height: videoStream.height,
      codec: videoStream.codec_name,
    };
  }
  if (audioStream) {
    result.audio = {
      codec: audioStream.codec_name,
      bit_rate: audioStream.bit_rate ? Number.parseInt(audioStream.bit_rate, 10) : null,
    };
  }
  return result;
};

/**
 * Parses a PDF file to extract page count and document info.
 *
 * The file is loaded through the promise-based `fs` API so the event loop is
 * not blocked while the document is read from disk.
 * @param {string} filepath - The path to the PDF file.
 * @returns {Promise<pdfData>} A promise that resolves to the PDF metadata.
 *   Any error raised while reading or parsing the file surfaces as a rejection.
 */
const handlePdf = async (filepath) => {
  const buffer = await fs.readFile(filepath);
  const { numpages, info } = await pdfParse(buffer);
  return { pages: numpages, info };
};

/**
 * Opens an Excel workbook and reports how many worksheets it has and what
 * they are called.
 * @param {string} filepath - The path to the Excel file.
 * @returns {Promise<excelData>} A promise that resolves to the Excel metadata.
 */
const handleExcel = async (filepath) => {
  const { SheetNames: names } = XLSX.readFile(filepath);
  return {
    sheets: names.length,
    sheetNames: names,
  };
};

/**
 * Parses a PowerPoint file and reports its slide count.
 * @param {string} filepath - The path to the PowerPoint file.
 * @returns {Promise<powerPointData>} A promise that resolves to the PowerPoint
 *   metadata; `slidesCount` is null when the parser yields no slides.
 */
const handlePowerPoint = async (filepath) => {
  // NOTE(review): assumes officeparser.parse resolves to an object with a `slides` collection — confirm against the installed officeparser version.
  const parsed = await officeparser.parse(filepath);
  const slides = parsed.slides;
  const count = slides ? slides.length : 0;
  if (count > 0) {
    return { slidesCount: count };
  }
  return { slidesCount: null };
};

/**
 * Pulls the raw text out of a Word document with `mammoth` and derives word
 * and character counts from it.
 * @param {string} filepath - The path to the Word file.
 * @returns {Promise<wordData>} A promise that resolves to the Word document metadata.
 */
const handleWord = async (filepath) => {
  const extraction = await mammoth.extractRawText({ path: filepath });
  const text = extraction.value;
  // A word is any maximal run of non-whitespace characters.
  const words = text.match(/\S+/g) || [];
  return {
    wordCount: words.length,
    charCount: text.length,
  };
};

/**
 * Reads a text file as UTF-8 and extracts basic statistics.
 *
 * Both LF and CRLF line endings are recognised. A newline at the very end of
 * the file terminates the last line rather than starting a new, empty one, so
 * "a\nb\n" counts as two lines and an empty file counts as zero.
 * @param {string} filepath - The path to the text file.
 * @returns {Promise<textData>} A promise that resolves to the text file metadata.
 *   Rejects if the file cannot be read.
 */
const handleText = async (filepath) => {
  const content = await fs.readFile(filepath, "utf-8");
  const lines = content.split(/\r?\n/);
  // Drop the phantom empty entry that split() yields after a trailing newline.
  if (lines.length > 1 && lines[lines.length - 1] === "") {
    lines.pop();
  }
  return {
    lines: content.length === 0 ? 0 : lines.length,
    charCount: content.length,
    firstLine: lines[0].slice(0, 100),
  };
};

/**
 * A fallback handler that uses the system's 'file' command to get generic information.
 *
 * The command is started with `execFile` and an argument array, so the path is
 * never parsed by a shell. `-b` (brief) makes `file` print only the description
 * without the "<path>: " prefix, so paths or descriptions containing ": " are
 * reported intact. `--` ends option parsing so a path starting with "-" is
 * still treated as a file name.
 * @param {string} filepath - The path to the binary file.
 * @returns {Promise<binaryData>} A promise that resolves to the file's platform info.
 *   Never rejects: `platformInfo` is "unknown" when the command cannot be run
 *   or prints nothing.
 */
const handleBinary = async (filepath) => {
  try {
    const { stdout } = await promisify(execFile)("file", ["-b", "--", filepath]);
    return { platformInfo: stdout.trim() || "unknown" };
  } catch (error) {
    console.warn(
      `'file' command failed to run. Check if it's installed and in your PATH.`,
    );
    return { platformInfo: "unknown" };
  }
};

/**
 * Lookup table from MIME type (or MIME type prefix) to the handler that
 * extracts metadata for it. Keys ending in "/" are type prefixes; the others
 * are complete MIME types.
 * @type {Object<string, Function>}
 */
const handlers = Object.fromEntries([
  // Images
  ["image/", handleImage],
  // Audio and video share the ffprobe-based handler
  ["video/", handleMedia],
  ["audio/", handleMedia],
  // Documents
  ["application/pdf", handlePdf],
  ["application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", handleExcel],
  ["application/vnd.ms-excel", handleExcel],
  ["application/vnd.openxmlformats-officedocument.presentationml.presentation", handlePowerPoint],
  ["application/vnd.openxmlformats-officedocument.wordprocessingml.document", handleWord],
  // Plain text
  ["text/", handleText],
]);

/**
 * Collects metadata for a file. When no MIME type is supplied (or the generic
 * "application/octet-stream" is given), the type is sniffed from the file's
 * contents. The first entry in `handlers` whose key prefixes the MIME type is
 * then run; if none matches, the generic binary handler is used instead.
 *
 * This function does not reject: if detection or a handler fails, a warning is
 * logged and whatever metadata was gathered so far is returned.
 * @param {string} filepath - The path to the file.
 * @param {string} [mimeType] - Optional MIME type to bypass file type detection.
 * @returns {Promise<object>} A promise that resolves to an object containing the file's metadata.
 */
const getMetadata = async (filepath, mimeType) => {
  const GENERIC_MIME = "application/octet-stream";
  let metadata = { mimeType: mimeType || GENERIC_MIME };

  try {
    if (metadata.mimeType === GENERIC_MIME) {
      const contents = await fs.readFile(filepath);
      const detected = await fileTypeFromBuffer(contents);
      if (detected) {
        metadata.mimeType = detected.mime;
      }
    }

    // An exact match is also a prefix match, so startsWith covers both cases.
    const matchedKey = Object.keys(handlers).find((prefix) =>
      metadata.mimeType.startsWith(prefix),
    );
    const extract = handlers[matchedKey] || handleBinary;
    const extracted = await extract(filepath);
    metadata = { ...metadata, ...extracted };
  } catch (error) {
    // Best effort: report the problem and fall through with what we have.
    console.warn(`Error extracting metadata for file '${filepath}': ${error.message}`);
  }

  return metadata;
};

// The module's sole export is the getMetadata entry point; the per-type handlers are not exported.
module.exports = getMetadata;