[memo] PDF.js with Typescript

Jan 22, 2023 12:48 · 525 words · 3 minute read

PDF.js is a library for handling PDF files on the standard web platform.

setup 🔗

The tsconfig.json is as below:

{
  "compilerOptions": {
    /* Language and Environment */
    "target": "ES6",                                  /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
 
    /* Modules */
    "module": "CommonJS",                                /* Specify what module code is generated. */
 
    /* Emit */
    "outDir": "./dist",                                   /* Specify an output folder for all emitted files. */

    /* Interop Constraints */
    "esModuleInterop": true,                             /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    "forceConsistentCasingInFileNames": true,            /* Ensure that casing is correct in imports. */

    /* Type Checking */
    "strict": true,                                      /* Enable all strict type-checking options. */

    /* Completeness */
    "skipLibCheck": true                                 /* Skip type checking all .d.ts files. */
  }
}

In this project, source files live in /src and the compiled output is emitted into /dist.

The webpack.config.js is as below:

const path = require('path');
const CleanPlugin = require('clean-webpack-plugin');
const HtmlWebpackPlugin = require('html-webpack-plugin');
const CopyWebpackPlugin = require('copy-webpack-plugin');

module.exports = {
  entry: path.resolve(__dirname, './src/index.ts'),
  output: {
		path: path.resolve(__dirname, 'dist'),
		filename: '[name].js',
	},
  devtool: "source-map",
  devServer: {
		static: {
      directory: path.join(__dirname, 'dist'),
    },
		host: "0.0.0.0",
    port: "8080",
	},
  module: {
    rules: [
      {
        test: /\.ts$/,
        use: 'ts-loader',
        exclude: /node_modules/,
      },
    ],
  },
  resolve: {
    extensions: [".ts", ".js"]
  },
  plugins:[
    new CleanPlugin.CleanWebpackPlugin(),
    new HtmlWebpackPlugin({
			template: './src/index.html',
		}),
    // new WasmPackPlugin({
    //   crateDirectory: path.resolve(__dirname, "src/lib/image-processing")
    // }),
    new CopyWebpackPlugin({
      patterns: [
        {
          from: './node_modules/pdfjs-dist/build/pdf.worker.js',
          to: './main.worker.js',
        },
        {
          from: './node_modules/pdfjs-dist/legacy/build/pdf.worker.js',
          to: './legacy.worker.js',
        },
      ],
    }),
  ]
};

Read a PDF, render it into a canvas element, return as Blob 🔗

Overall code is as below:

import * as pdfjs from "pdfjs-dist";
// Tell PDF.js which script to load as its worker. "legacy.worker.js" is the
// name under which the webpack config copies
// pdfjs-dist/legacy/build/pdf.worker.js.
// NOTE(review): the API is imported from the default "pdfjs-dist" entry while
// the worker is the legacy build — confirm this pairing is intended.
pdfjs.GlobalWorkerOptions.workerSrc = "legacy.worker.js";
// Alternative: load the worker from a CDN instead of shipping a local copy.
// pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.1.266/pdf.worker.js`;

/**
 * Loads the PDF at `srcURL` and renders each of its pages to a separate canvas.
 *
 * Pages are fetched and rendered one at a time, from page 1 up to `numPages`.
 *
 * @param srcURL - Location of the PDF, handed to `pdfjs.getDocument`.
 * @returns One canvas per page, in page order.
 */
async function renderPDFToCanvas(srcURL: string): Promise<HTMLCanvasElement[]> {
  const loadingTask = pdfjs.getDocument(srcURL);
  const doc = await loadingTask.promise;

  const rendered: HTMLCanvasElement[] = [];
  let pageNumber = 1;
  while (pageNumber <= doc.numPages) {
    const page = await doc.getPage(pageNumber);
    const pageCanvas = await renderOnePDFPageToCanvas(page);
    rendered.push(pageCanvas);
    pageNumber += 1;
  }
  return rendered;
}

/**
 * Renders a single PDF page onto a newly created, detached `<canvas>` element.
 *
 * @param page - The PDF.js page proxy to draw.
 * @returns The canvas, sized to the page's viewport at scale 1, after
 *   rendering has completed.
 * @throws Error if no 2D rendering context can be obtained for the canvas.
 */
async function renderOnePDFPageToCanvas(page: pdfjs.PDFPageProxy): Promise<HTMLCanvasElement> {
  const scale = 1;
  const viewport = page.getViewport({ scale });

  // `createElement("canvas")` is already typed as HTMLCanvasElement; no cast needed.
  const canvas = document.createElement("canvas");

  // `getContext` is allowed to return null. Fail here with a clear message
  // instead of passing a null context on to PDF.js.
  const context = canvas.getContext("2d");
  if (context === null) {
    throw new Error("could not acquire a 2D rendering context for the canvas");
  }

  canvas.height = viewport.height;
  canvas.width = viewport.width;

  // Render the page into the `<canvas>` element. `render()` returns a task
  // object, so the completion promise to await is its `.promise` property.
  await page.render({ canvasContext: context, viewport }).promise;
  return canvas;
}

/**
 * Renders every page of a PDF and converts each rendered page to a PNG blob.
 *
 * @param srcURL - URL of the PDF to open; defaults to the `helloworld.pdf`
 *   example from the mozilla/pdf.js repository.
 * @returns One PNG `Blob` per canvas, in the order `renderPDFToCanvas` returns them.
 */
export async function openPDFAsPngBlob(srcURL = "https://raw.githubusercontent.com/mozilla/pdf.js/ba2edeae/examples/learning/helloworld.pdf"): Promise<Blob[]> {
  const pageCanvases = await renderPDFToCanvas(srcURL);
  return pageCanvases.map((pageCanvas) => {
    const pngDataURI = pageCanvas.toDataURL('image/png');
    return dataURIToBlob(pngDataURI);
  });
}


/**
 * Converts a `data:` URI into a `Blob`.
 *
 * The text before the first comma (minus the `data:` prefix and a trailing
 * `;base64` flag) becomes the blob's MIME type. The text after it is the
 * payload: base64 payloads are decoded; any other payload is used as-is, one
 * byte per character (it is NOT percent-decoded).
 *
 * @param dataURI - The data URI, e.g. the result of `canvas.toDataURL()`.
 * @returns A blob holding the payload bytes, typed with the URI's MIME type.
 * @throws URIError if the input contains no comma separating header and payload.
 */
function dataURIToBlob(dataURI: string): Blob {
  // Split at the FIRST comma only: a non-base64 payload may itself contain commas.
  const separatorIndex = dataURI.indexOf(',');
  if (separatorIndex === -1) {
    throw new URIError("invalid data uri")
  }
  const header = dataURI.slice(0, separatorIndex).replace(/^data:/, '');
  const payload = dataURI.slice(separatorIndex + 1);

  // The base64 flag is the last token of the header, so anchor the match there
  // rather than matching ";base64" anywhere inside a media-type parameter.
  const base64Flag = /;base64$/;
  const isBase64 = base64Flag.test(header);
  const mime = isBase64 ? header.replace(base64Flag, '') : header;

  // Use the global `atob` instead of `window.atob` so this also works where
  // `window` is undefined (e.g. inside a web worker).
  const binary = isBase64 ? atob(payload) : payload;

  const bytes = new Uint8Array(binary.length);
  for (let i = 0; i < binary.length; i++) {
    bytes[i] = binary.charCodeAt(i);
  }
  return new Blob(
    [bytes],
    { type: mime },
  )
}

pdfjs.GlobalWorkerOptions.workerSrc = "legacy.worker.js";

This line tells PDF.js where to load its worker script from.
The worker script ships inside the pdfjs-dist package.
This is why the webpack config needs CopyWebpackPlugin: it copies the worker script into the output directory.

dataURIToBlob

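At first I thought I could use canvas.toBlob, but it is callback-based: the callback is only called after the function has already returned, so the Blob is not available in time.
So converting the data URI by hand is a workaround.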

await page.render(renderContext).promise

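This is the point where I got stuck. I first wrote await page.render(renderContext); and got empty data. page.render() returns a RenderTask rather than a promise, so awaiting it directly does not wait for rendering to finish; you have to await its .promise property.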

References 🔗

web