// source/parsers/pdf.ts
// The text extractor for PDF files.
// @ts-expect-error There are no types for this package.
import parsePdf from 'pdf-parse/lib/pdf-parse.js';
export class PdfExtractor {
    constructor() {
        /**
         * MIME type(s) this extractor accepts.
         */
        this.mimes = ['application/pdf'];
        /**
         * Pull the text out of a PDF document.
         *
         * @param input The PDF data, passed unchanged to `parsePdf`.
         * @returns The `text` property of the result produced by `parsePdf`.
         */
        this.apply = async (input) => {
            // Pages are rendered with `renderPage` instead of the parser's
            // built-in renderer.
            const parseOptions = { pagerender: renderPage };
            const { text } = await parsePdf(input, parseOptions);
            return text;
        };
    }
}
/**
 * We have to redefine this function to ensure that there are spaces between
 * words in the output text.
 *
 * @param data The data stored in the PDF about the page.
 * @returns The text content on the page
 */
const renderPage = async (data) => {
    const options = {
        normalizeWhitespace: false,
        disableCombineTextItems: false,
    };
    // @ts-expect-error todo: figure out the types
    return data.getTextContent(options).then((textContent) => {
        let lastY = '';
        let text = '';
        // @ts-expect-error todo: figure out the types
        for (const item of textContent.items) {
            if (!(lastY === item.transform[5] || !lastY))
                text += '\n';
            // The word + a space
            text += item.str + ' ';
            lastY = item.transform[5];
        }
        return text;
    });
};
