diff --git a/src/main/java/com/mindee/image/ImageExtractor.java b/src/main/java/com/mindee/image/ImageExtractor.java index 3309e160a..e133eda7e 100644 --- a/src/main/java/com/mindee/image/ImageExtractor.java +++ b/src/main/java/com/mindee/image/ImageExtractor.java @@ -4,15 +4,17 @@ import com.mindee.geometry.PositionDataField; import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; -import com.mindee.pdf.PDFBoxApi; -import com.mindee.pdf.PDFOperation; -import com.mindee.pdf.PdfPageImage; import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; import javax.imageio.ImageIO; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.rendering.ImageType; +import org.apache.pdfbox.rendering.PDFRenderer; /** * Extract sub-images from an image. @@ -22,14 +24,14 @@ public class ImageExtractor { private final String filename; private final String saveFormat; - public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException { + public ImageExtractor(LocalInputSource source) throws IOException { this.filename = source.getFilename(); this.pageImages = new ArrayList<>(); - if (source.isPdf()) { + if (source.isPDF()) { this.saveFormat = "jpg"; - var pdfPageImages = pdfOperation.pdfToImages(source); - for (PdfPageImage pdfPageImage : pdfPageImages) { + var pdfPageImages = pdfToImages(source.getFile(), this.filename); + for (PDFPageImage pdfPageImage : pdfPageImages) { this.pageImages.add(pdfPageImage.getImage()); } } else { @@ -41,14 +43,34 @@ public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws } } - /** - * Init from a {@link LocalInputSource}. - * - * @param source The local source. - * @throws IOException Throws if the file can't be accessed. - */ - public ImageExtractor(LocalInputSource source) throws IOException { - this(source, new PDFBoxApi()); + public List pdfToImages(byte[] fileBytes, String filename) throws IOException { + PDDocument document = Loader.loadPDF(fileBytes); + var pdfRenderer = new PDFRenderer(document); + List pdfPageImages = new ArrayList<>(); + for (int i = 0; i < document.getNumberOfPages(); i++) { + var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); + pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg")); + } + document.close(); + return pdfPageImages; + } + + private BufferedImage pdfPageToImageBuffer( + int index, + PDDocument document, + PDFRenderer pdfRenderer + ) throws IOException { + PDRectangle bbox = document.getPage(index).getBBox(); + float dimension = bbox.getWidth() * bbox.getHeight(); + int dpi; + if (dimension < 200000) { + dpi = 300; + } else if (dimension < 300000) { + dpi = 250; + } else { + dpi = 200; + } + return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); } /** diff --git a/src/main/java/com/mindee/pdf/PdfPageImage.java b/src/main/java/com/mindee/image/PDFPageImage.java similarity index 96% rename from src/main/java/com/mindee/pdf/PdfPageImage.java rename to src/main/java/com/mindee/image/PDFPageImage.java index 9ef2e54d7..fe9493b36 100644 --- a/src/main/java/com/mindee/pdf/PdfPageImage.java +++ b/src/main/java/com/mindee/image/PDFPageImage.java @@ -1,4 +1,4 @@ -package com.mindee.pdf; +package com.mindee.image; import com.mindee.MindeeException; import com.mindee.input.InputSourceUtils; @@ -16,13 +16,13 @@ * A page in a PDF extracted as an image. */ @Getter -public class PdfPageImage { +public class PDFPageImage { private final BufferedImage image; private final int originalIndex; private final String saveFormat; private final String originalFilename; - public PdfPageImage( + public PDFPageImage( BufferedImage image, int originalIndex, String originalFilename, diff --git a/src/main/java/com/mindee/input/InputSourceUtils.java b/src/main/java/com/mindee/input/InputSourceUtils.java index 076320b56..f6d86877d 100644 --- a/src/main/java/com/mindee/input/InputSourceUtils.java +++ b/src/main/java/com/mindee/input/InputSourceUtils.java @@ -1,12 +1,6 @@ package com.mindee.input; import com.mindee.MindeeException; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import org.apache.pdfbox.Loader; -import org.apache.pdfbox.io.RandomAccessReadBuffer; -import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.text.PDFTextStripper; /** * Utilities for working with files. @@ -65,46 +59,4 @@ public static String[] splitNameStrict(String filename) throws MindeeException { } return new String[] { name, extension }; } - - /** - * Returns true if the file is a PDF. - */ - public static boolean isPdf(byte[] fileBytes) { - try { - Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); - } catch (IOException e) { - return false; - } - return true; - } - - /** - * Returns true if the source PDF has source text inside. Returns false for images. - * - * @param fileBytes A byte array representing a PDF. - * @return True if at least one character exists in one page. - * @throws MindeeException if the file could not be read. - */ - public static boolean hasSourceText(byte[] fileBytes) { - try { - PDDocument document = Loader - .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); - PDFTextStripper stripper = new PDFTextStripper(); - - for (int i = 0; i < document.getNumberOfPages(); i++) { - stripper.setStartPage(i + 1); - stripper.setEndPage(i + 1); - String pageText = stripper.getText(document); - if (!pageText.trim().isEmpty()) { - document.close(); - return true; - } - } - document.close(); - } catch (IOException e) { - return false; - } - - return false; - } } diff --git a/src/main/java/com/mindee/input/LocalInputSource.java b/src/main/java/com/mindee/input/LocalInputSource.java index 32beff36b..073cb046f 100644 --- a/src/main/java/com/mindee/input/LocalInputSource.java +++ b/src/main/java/com/mindee/input/LocalInputSource.java @@ -1,9 +1,10 @@ package com.mindee.input; import com.mindee.image.ImageCompressor; -import com.mindee.pdf.PDFBoxApi; +import com.mindee.pdf.PDFCompression; import com.mindee.pdf.PDFCompressor; -import com.mindee.pdf.PDFOperation; +import com.mindee.pdf.PDFInputOperation; +import com.mindee.pdf.PDFInputOperator; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -17,14 +18,18 @@ /** * A source document for Mindee API operations. */ -public final class LocalInputSource { +public class LocalInputSource { @Getter private byte[] file; @Getter private final String filename; @Setter - private PDFOperation pdfOperation; + private PDFInputOperation pdfInputOperator; + @Setter + private PDFCompression pdfCompressor; + // Store here to avoid recalculating every time. + private Boolean isPDF; public LocalInputSource(InputStream file, String filename) throws IOException { this.file = IOUtils.toByteArray(file); @@ -57,11 +62,18 @@ public LocalInputSource(String fileAsBase64, String filename) { this.filename = filename; } - public PDFOperation getPdfOperation() { - if (this.pdfOperation == null) { - this.pdfOperation = new PDFBoxApi(); + private PDFInputOperation getPDFInputOperator() { + if (this.pdfInputOperator == null) { + this.pdfInputOperator = new PDFInputOperator(); + } + return this.pdfInputOperator; + } + + private PDFCompression getPDFCompressor() { + if (this.pdfCompressor == null) { + this.pdfCompressor = new PDFCompressor(); } - return this.pdfOperation; + return this.pdfCompressor; } /** @@ -71,10 +83,10 @@ public PDFOperation getPdfOperation() { * @throws IOException If an I/O error occurs during the PDF operation. */ public int getPageCount() throws IOException { - if (!this.isPdf()) { + if (!this.isPDF()) { return 1; } - return getPdfOperation().getNumberOfPages(this.file); + return getPDFInputOperator().getPageCount(this.file); } /** @@ -84,35 +96,38 @@ public int getPageCount() throws IOException { * @throws IOException If an I/O error occurs during the PDF operation. */ public void applyPageOptions(PageOptions pageOptions) throws IOException { - if (pageOptions != null && this.isPdf()) { - this.file = getPdfOperation().split(this.file, pageOptions).getFile(); + if (pageOptions != null && this.isPDF()) { + this.file = getPDFInputOperator().split(this.file, pageOptions).getFile(); } } - public boolean isPdf() { - return InputSourceUtils.isPdf(this.file); - } - - public boolean hasSourceText() { - return InputSourceUtils.hasSourceText(this.file); + /** + * Returns true if the file is a PDF. + */ + public boolean isPDF() { + if (this.isPDF == null) { + this.isPDF = getPDFInputOperator().isPDF(this.file); + } + return this.isPDF; } public void compress( - Integer quality, + int quality, Integer maxWidth, Integer maxHeight, Boolean forceSourceText, Boolean disableSourceText ) throws IOException { - if (isPdf()) { - this.file = PDFCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText); + if (isPDF()) { + this.file = getPDFCompressor() + .compressPDF(this.file, quality, forceSourceText, disableSourceText); } else { this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight); } } public void compress( - Integer quality, + int quality, Integer maxWidth, Integer maxHeight, Boolean forceSourceText @@ -120,15 +135,19 @@ public void compress( this.compress(quality, maxWidth, maxHeight, forceSourceText, true); } - public void compress(Integer quality, Integer maxWidth, Integer maxHeight) throws IOException { - this.compress(quality, maxWidth, maxHeight, false, true); + public void compress( + int quality, + boolean forceSourceText, + boolean disableSourceText + ) throws IOException { + this.compress(quality, null, null, forceSourceText, disableSourceText); } - public void compress(Integer quality, Integer maxWidth) throws IOException { - this.compress(quality, maxWidth, null, false, true); + public void compress(int quality, Integer maxWidth, Integer maxHeight) throws IOException { + this.compress(quality, maxWidth, maxHeight, false, true); } - public void compress(Integer quality) throws IOException { + public void compress(int quality) throws IOException { this.compress(quality, null, null, false, true); } diff --git a/src/main/java/com/mindee/pdf/BasePDFExtractor.java b/src/main/java/com/mindee/pdf/BasePDFExtractor.java index f9b6b21a8..d7263d803 100644 --- a/src/main/java/com/mindee/pdf/BasePDFExtractor.java +++ b/src/main/java/com/mindee/pdf/BasePDFExtractor.java @@ -6,7 +6,6 @@ import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -33,9 +32,9 @@ public class BasePDFExtractor { * @param source The local source. * @throws IOException Throws if the file can't be accessed. */ - protected BasePDFExtractor(LocalInputSource source) throws IOException { + public BasePDFExtractor(LocalInputSource source) throws IOException { this.filename = source.getFilename(); - if (source.isPdf()) { + if (source.isPDF()) { this.sourcePdf = Loader.loadPDF(source.getFile()); } else { var document = new PDDocument(); @@ -57,15 +56,6 @@ protected BasePDFExtractor(LocalInputSource source) throws IOException { } } - /** - * Get the number of pages in the PDF file. - * - * @return The number of pages in the PDF file. - */ - public int getPageCount() { - return sourcePdf.getNumberOfPages(); - } - /** * Converts an array to a buffered image. * @@ -106,10 +96,7 @@ public List extractSubDocuments( + splitName[1]; extractedPDFs .add( - new ExtractedPDF( - Loader.loadPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false)), - fieldFilename - ) + new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename) ); } return extractedPDFs; @@ -149,25 +136,7 @@ private static byte[] createPdfFromExistingPdf( return output; } - /** - * Merge specified PDF pages together. - * - * @param file The PDF file. - * @param pageNumbers Lit of page numbers to merge together. - */ - public static byte[] mergePdfPages(File file, List pageNumbers) throws IOException { - PDDocument document = Loader.loadPDF(file); - return createPdfFromExistingPdf(document, pageNumbers, true); - } - - public static byte[] mergePdfPages( - PDDocument document, - List pageNumbers - ) throws IOException { - return mergePdfPages(document, pageNumbers, true); - } - - public static byte[] mergePdfPages( + public byte[] mergePdfPages( PDDocument document, List pageNumbers, boolean closeOriginal diff --git a/src/main/java/com/mindee/pdf/ExtractedPDF.java b/src/main/java/com/mindee/pdf/ExtractedPDF.java index 7bfe24c95..8b9c7c256 100644 --- a/src/main/java/com/mindee/pdf/ExtractedPDF.java +++ b/src/main/java/com/mindee/pdf/ExtractedPDF.java @@ -1,29 +1,27 @@ package com.mindee.pdf; import com.mindee.input.LocalInputSource; -import java.io.ByteArrayOutputStream; -import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Paths; import lombok.Getter; -import org.apache.pdfbox.pdmodel.PDDocument; /** * An extracted sub-PDF. */ @Getter public class ExtractedPDF { - private final PDDocument pdf; + private final byte[] fileBytes; private final String filename; /** * Default constructor. * - * @param pdf PDF wrapper object. + * @param fileBytes PDF file as bytes. * @param filename Name of the extracted file. */ - public ExtractedPDF(PDDocument pdf, String filename) { - this.pdf = pdf; + public ExtractedPDF(byte[] fileBytes, String filename) { + this.fileBytes = fileBytes; this.filename = filename; } @@ -35,8 +33,7 @@ public ExtractedPDF(PDDocument pdf, String filename) { */ public void writeToFile(String outputPath) throws IOException { var pdfPath = Paths.get(outputPath, this.filename); - var outputfile = new File(pdfPath.toString()); - this.pdf.save(outputfile); + Files.write(pdfPath, this.fileBytes); } /** @@ -46,8 +43,6 @@ public void writeToFile(String outputPath) throws IOException { * @throws IOException Throws if the file can't be accessed. */ public LocalInputSource asInputSource() throws IOException { - var output = new ByteArrayOutputStream(); - this.pdf.save(output); - return new LocalInputSource(output.toByteArray(), this.filename); + return new LocalInputSource(this.fileBytes, this.filename); } } diff --git a/src/main/java/com/mindee/pdf/PDFCompression.java b/src/main/java/com/mindee/pdf/PDFCompression.java new file mode 100644 index 000000000..e8619014d --- /dev/null +++ b/src/main/java/com/mindee/pdf/PDFCompression.java @@ -0,0 +1,28 @@ +package com.mindee.pdf; + +import java.io.IOException; + +public interface PDFCompression { + byte[] compressPDF( + byte[] fileBytes, + Integer imageQuality, + Boolean forceSourceTextCompression, + Boolean disableSourceText + ) throws IOException; + + default byte[] compressPDF( + byte[] fileBytes, + Integer imageQuality, + Boolean forceSourceTextCompression + ) throws IOException { + return compressPDF(fileBytes, imageQuality, forceSourceTextCompression, true); + } + + default byte[] compressPDF(byte[] fileBytes, Integer imageQuality) throws IOException { + return compressPDF(fileBytes, imageQuality, false, true); + } + + default byte[] compressPDF(byte[] fileBytes) throws IOException { + return compressPDF(fileBytes, 85, false, true); + } +} diff --git a/src/main/java/com/mindee/pdf/PDFCompressor.java b/src/main/java/com/mindee/pdf/PDFCompressor.java index ccf498eeb..0932f1973 100644 --- a/src/main/java/com/mindee/pdf/PDFCompressor.java +++ b/src/main/java/com/mindee/pdf/PDFCompressor.java @@ -1,14 +1,14 @@ package com.mindee.pdf; -import static com.mindee.input.InputSourceUtils.hasSourceText; -import static com.mindee.input.InputSourceUtils.isPdf; - +import com.mindee.MindeeException; import java.awt.*; import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.List; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPageContentStream; @@ -26,15 +26,22 @@ /** * PDF compression class. */ -public class PDFCompressor { - public static byte[] compressPdf( - byte[] pdfData, +public class PDFCompressor implements PDFCompression { + private final PDFInputOperator pdfInputOperator; + + public PDFCompressor() { + this.pdfInputOperator = new PDFInputOperator(); + } + + @Override + public byte[] compressPDF( + byte[] fileBytes, Integer imageQuality, Boolean forceSourceTextCompression, Boolean disableSourceText ) throws IOException { - if (!isPdf(pdfData)) { - return pdfData; + if (!pdfInputOperator.isPDF(fileBytes)) { + return fileBytes; } if (forceSourceTextCompression == null) { @@ -43,14 +50,14 @@ public static byte[] compressPdf( if (disableSourceText == null) { disableSourceText = true; } - if (!forceSourceTextCompression && hasSourceText(pdfData)) { + if (!forceSourceTextCompression && hasSourceText(fileBytes)) { System.out .println( "MINDEE WARNING: Found text inside of the provided PDF file. Compression operation aborted." ); - return pdfData; + return fileBytes; } - try (PDDocument inputDoc = Loader.loadPDF(pdfData); PDDocument outputDoc = new PDDocument()) { + try (var inputDoc = Loader.loadPDF(fileBytes); PDDocument outputDoc = new PDDocument()) { var pdfRenderer = new PDFRenderer(inputDoc); @@ -75,20 +82,33 @@ public static byte[] compressPdf( } } - public static byte[] compressPdf( - byte[] pdfData, - Integer imageQuality, - Boolean forceSourceTextCompression - ) throws IOException { - return compressPdf(pdfData, imageQuality, forceSourceTextCompression, true); - } - - public static byte[] compressPdf(byte[] pdfData, Integer imageQuality) throws IOException { - return compressPdf(pdfData, imageQuality, false, true); - } - - public static byte[] compressPdf(byte[] pdfData) throws IOException { - return compressPdf(pdfData, 85, false, true); + /** + * Returns true if the source PDF has source text inside. Returns false for images. + * + * @param fileBytes A byte array representing a PDF. + * @return True if at least one character exists in one page. + * @throws MindeeException if the file could not be read. + */ + private boolean hasSourceText(byte[] fileBytes) { + try { + PDDocument document = Loader + .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); + var stripper = new PDFTextStripper(); + + for (int i = 0; i < document.getNumberOfPages(); i++) { + stripper.setStartPage(i + 1); + stripper.setEndPage(i + 1); + String pageText = stripper.getText(document); + if (!pageText.trim().isEmpty()) { + document.close(); + return true; + } + } + document.close(); + } catch (IOException e) { + return false; + } + return false; } private static byte[] documentToBytes(PDDocument document) throws IOException { @@ -134,9 +154,9 @@ protected void writeString(String text, List textPositions) throws return; } - TextPosition firstPosition = textPositions.get(0); + var firstPosition = textPositions.get(0); float fontSize = firstPosition.getFontSizeInPt(); - PDColor color = getGraphicsState().getNonStrokingColor(); + var color = getGraphicsState().getNonStrokingColor(); contentStream.beginText(); contentStream.setFont(firstPosition.getFont(), fontSize); contentStream.setNonStrokingColor(convertToAwtColor(color)); diff --git a/src/main/java/com/mindee/pdf/PDFInputOperation.java b/src/main/java/com/mindee/pdf/PDFInputOperation.java new file mode 100644 index 000000000..a69cc50af --- /dev/null +++ b/src/main/java/com/mindee/pdf/PDFInputOperation.java @@ -0,0 +1,28 @@ +package com.mindee.pdf; + +import com.mindee.input.PageOptions; +import java.io.IOException; + +public interface PDFInputOperation { + + /** + * Split a PDF file. + * + * @param fileBytes A byte array representing a PDF. + */ + SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; + + /** + * Get the number of pages in a PDF file. + * + * @param fileBytes A byte array representing a PDF. + */ + int getPageCount(byte[] fileBytes) throws IOException; + + /** + * Returns true if the file is a PDF. + * + * @param fileBytes A byte array representing a PDF. + */ + boolean isPDF(byte[] fileBytes); +} diff --git a/src/main/java/com/mindee/pdf/PDFBoxApi.java b/src/main/java/com/mindee/pdf/PDFInputOperator.java similarity index 54% rename from src/main/java/com/mindee/pdf/PDFBoxApi.java rename to src/main/java/com/mindee/pdf/PDFInputOperator.java index 6afcb8c69..e5657fcfa 100644 --- a/src/main/java/com/mindee/pdf/PDFBoxApi.java +++ b/src/main/java/com/mindee/pdf/PDFInputOperator.java @@ -2,7 +2,7 @@ import com.mindee.MindeeException; import com.mindee.input.PageOptions; -import java.awt.image.BufferedImage; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.util.ArrayList; @@ -13,26 +13,24 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.rendering.ImageType; -import org.apache.pdfbox.rendering.PDFRenderer; /** * Allows performing various operations on PDFs. */ -public final class PDFBoxApi implements PDFOperation { +public final class PDFInputOperator implements PDFInputOperation { @Override public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException { - if (!checkPdfOpen(fileBytes)) { + if (!isPDFOpen(fileBytes)) { throw new MindeeException("This document cannot be open and cannot be split."); } try (var originalDocument = Loader.loadPDF(fileBytes)) { try (var splitDocument = new PDDocument()) { - int totalOriginalPages = getNumberOfPages(fileBytes); + int totalOriginalPages = getPageCount(fileBytes); if (totalOriginalPages < pageOptions.getOnMinPages()) { return new SplitPDF(fileBytes, totalOriginalPages); @@ -47,63 +45,31 @@ public SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOExcept try (ByteArrayOutputStream outputStream = new ByteArrayOutputStream()) { splitDocument.save(outputStream); byte[] splitPdf = outputStream.toByteArray(); - return new SplitPDF(splitPdf, getNumberOfPages(splitPdf)); + return new SplitPDF(splitPdf, getPageCount(splitPdf)); } } } } @Override - public int getNumberOfPages(byte[] fileBytes) throws IOException { + public int getPageCount(byte[] fileBytes) throws IOException { var document = Loader.loadPDF(fileBytes); int pageCount = document.getNumberOfPages(); document.close(); return pageCount; } + /** + * Returns true if the file is a PDF. + */ @Override - public PdfPageImage pdfPageToImage( - byte[] fileBytes, - String filename, - int pageNumber - ) throws IOException { - int index = pageNumber - 1; - PDDocument document = Loader.loadPDF(fileBytes); - var pdfRenderer = new PDFRenderer(document); - BufferedImage imageBuffer = pdfPageToImageBuffer(index, document, pdfRenderer); - document.close(); - return new PdfPageImage(imageBuffer, index, filename, "jpg"); - } - - @Override - public List pdfToImages(byte[] fileBytes, String filename) throws IOException { - PDDocument document = Loader.loadPDF(fileBytes); - var pdfRenderer = new PDFRenderer(document); - List pdfPageImages = new ArrayList<>(); - for (int i = 0; i < document.getNumberOfPages(); i++) { - var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer); - pdfPageImages.add(new PdfPageImage(imageBuffer, i, filename, "jpg")); - } - document.close(); - return pdfPageImages; - } - - private BufferedImage pdfPageToImageBuffer( - int index, - PDDocument document, - PDFRenderer pdfRenderer - ) throws IOException { - PDRectangle bbox = document.getPage(index).getBBox(); - float dimension = bbox.getWidth() * bbox.getHeight(); - int dpi; - if (dimension < 200000) { - dpi = 300; - } else if (dimension < 300000) { - dpi = 250; - } else { - dpi = 200; + public boolean isPDF(byte[] fileBytes) { + try { + Loader.loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes))); + } catch (IOException e) { + return false; } - return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB); + return true; } private List getPageRanges(PageOptions pageOptions, Integer numberOfPages) { @@ -128,10 +94,10 @@ private List getPageRanges(PageOptions pageOptions, Integer numberOfPag } } - private boolean checkPdfOpen(byte[] documentFile) { + private boolean isPDFOpen(byte[] fileBytes) { boolean opens = false; try { - Loader.loadPDF(documentFile).close(); + Loader.loadPDF(fileBytes).close(); opens = true; } catch (IOException e) { e.printStackTrace(); diff --git a/src/main/java/com/mindee/pdf/PDFOperation.java b/src/main/java/com/mindee/pdf/PDFOperation.java deleted file mode 100644 index e9c0af58a..000000000 --- a/src/main/java/com/mindee/pdf/PDFOperation.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.mindee.pdf; - -import com.mindee.input.LocalInputSource; -import com.mindee.input.PageOptions; -import java.io.IOException; -import java.util.List; - -/** - * Minimum PDF operations. - */ -public interface PDFOperation { - - /** - * Split a PDF file. - */ - SplitPDF split(byte[] fileBytes, PageOptions pageOptions) throws IOException; - - /** - * Get the number of pages in a PDF file. - */ - int getNumberOfPages(byte[] fileBytes) throws IOException; - - default int getNumberOfPages(LocalInputSource inputSource) throws IOException { - return getNumberOfPages(inputSource.getFile()); - } - - /** - * Render a single page of a PDF as an image. - */ - PdfPageImage pdfPageToImage(byte[] fileBytes, String filename, int pageNumber) throws IOException; - - default PdfPageImage pdfPageToImage(LocalInputSource source, int pageNumber) throws IOException { - return pdfPageToImage(source.getFile(), source.getFilename(), pageNumber); - } - - /** - * Render all pages of a PDF as images. - */ - List pdfToImages(byte[] fileBytes, String filename) throws IOException; - - default List pdfToImages(LocalInputSource source) throws IOException { - return pdfToImages(source.getFile(), source.getFilename()); - } -} diff --git a/src/main/java/com/mindee/v1/MindeeClient.java b/src/main/java/com/mindee/v1/MindeeClient.java index cd167d2b1..67a20cf9b 100644 --- a/src/main/java/com/mindee/v1/MindeeClient.java +++ b/src/main/java/com/mindee/v1/MindeeClient.java @@ -4,8 +4,8 @@ import com.mindee.input.LocalInputSource; import com.mindee.input.PageOptions; import com.mindee.input.URLInputSource; -import com.mindee.pdf.PDFBoxApi; -import com.mindee.pdf.PDFOperation; +import com.mindee.pdf.PDFInputOperation; +import com.mindee.pdf.PDFInputOperator; import com.mindee.v1.clientOptions.PollingOptions; import com.mindee.v1.clientOptions.PredictOptions; import com.mindee.v1.clientOptions.WorkflowOptions; @@ -26,7 +26,7 @@ */ public class MindeeClient { - protected PDFOperation pdfOperation; + protected PDFInputOperation pdfOperation; private final MindeeApiV1 mindeeApi; /** @@ -34,7 +34,7 @@ public class MindeeClient { * You'll need to set the API key in the environment for this approach to work properly. */ public MindeeClient() { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputOperator(); this.mindeeApi = createDefaultApi(""); } @@ -44,7 +44,7 @@ public MindeeClient() { * @param apiKey The api key to use. */ public MindeeClient(String apiKey) { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputOperator(); this.mindeeApi = createDefaultApi(apiKey); } @@ -54,7 +54,7 @@ public MindeeClient(String apiKey) { * @param mindeeApi The MindeeApi implementation to be used by the created MindeeClient. */ public MindeeClient(MindeeApiV1 mindeeApi) { - this.pdfOperation = new PDFBoxApi(); + this.pdfOperation = new PDFInputOperator(); this.mindeeApi = mindeeApi; } @@ -64,7 +64,7 @@ public MindeeClient(MindeeApiV1 mindeeApi) { * @param pdfOperation The PdfOperation implementation to be used by the created MindeeClient. * @param mindeeApi The MindeeApi implementation to be used by the created MindeeClient. */ - public MindeeClient(PDFOperation pdfOperation, MindeeApiV1 mindeeApi) { + public MindeeClient(PDFInputOperation pdfOperation, MindeeApiV1 mindeeApi) { this.pdfOperation = pdfOperation; this.mindeeApi = mindeeApi; } @@ -127,13 +127,8 @@ protected byte[] getSplitFile( LocalInputSource localInputSource, PageOptions pageOptions ) throws IOException { - byte[] splitFile; - if (pageOptions == null || !localInputSource.isPdf()) { - splitFile = localInputSource.getFile(); - } else { - splitFile = pdfOperation.split(localInputSource.getFile(), pageOptions).getFile(); - } - return splitFile; + localInputSource.applyPageOptions(pageOptions); + return localInputSource.getFile(); } /** diff --git a/src/test/java/com/mindee/input/FileCompressionTest.java b/src/test/java/com/mindee/input/FileCompressionTest.java index 50b2677cf..0047c8cfa 100644 --- a/src/test/java/com/mindee/input/FileCompressionTest.java +++ b/src/test/java/com/mindee/input/FileCompressionTest.java @@ -14,7 +14,6 @@ import java.util.stream.Collectors; import javax.imageio.ImageIO; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.text.PDFTextStripper; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -251,13 +250,14 @@ public void testPdfResizeFromCompressor() throws IOException { Path outputDir = getResourcePath("output"); Path inputPath = getV1ResourcePath("products/invoice_splitter/default_sample.pdf"); LocalInputSource pdfResizeInput = new LocalInputSource(inputPath.toString()); - + pdfResizeInput.compress(); + var compressor = new PDFCompressor(); List resizes = Arrays .asList( - PDFCompressor.compressPdf(pdfResizeInput.getFile()), - PDFCompressor.compressPdf(pdfResizeInput.getFile(), 75), - PDFCompressor.compressPdf(pdfResizeInput.getFile(), 50), - PDFCompressor.compressPdf(pdfResizeInput.getFile(), 10) + compressor.compressPDF(pdfResizeInput.getFile()), + compressor.compressPDF(pdfResizeInput.getFile(), 75), + compressor.compressPDF(pdfResizeInput.getFile(), 50), + compressor.compressPDF(pdfResizeInput.getFile(), 10) ); List outputPaths = Arrays @@ -323,11 +323,11 @@ public void testPdfResizeFromCompressor() throws IOException { public void testPdfResizeWithTextKeepsText() throws IOException { Path inputPath = getResourcePath("file_types/pdf/multipage.pdf"); LocalInputSource initialWithText = new LocalInputSource(inputPath.toString()); - byte[] compressedWithText = PDFCompressor - .compressPdf(initialWithText.getFile(), 100, true, false); - PDDocument originalDoc = Loader.loadPDF(initialWithText.getFile()); - PDDocument compressedDoc = Loader.loadPDF(compressedWithText); + var originalDoc = Loader.loadPDF(initialWithText.getFile()); + + initialWithText.compress(100, true, false); + var compressedDoc = Loader.loadPDF(initialWithText.getFile()); Assertions.assertEquals(originalDoc.getNumberOfPages(), compressedDoc.getNumberOfPages()); Assertions.assertNotEquals(originalDoc.hashCode(), compressedDoc.hashCode()); diff --git a/src/test/java/com/mindee/input/LocalInputSourceTest.java b/src/test/java/com/mindee/input/LocalInputSourceTest.java index 6482bf51e..81788e354 100644 --- a/src/test/java/com/mindee/input/LocalInputSourceTest.java +++ b/src/test/java/com/mindee/input/LocalInputSourceTest.java @@ -16,8 +16,7 @@ public class LocalInputSourceTest { void assertMultipagePDF(LocalInputSource inputSource, Path filePath) throws IOException { Assertions.assertNotNull(inputSource); - Assertions.assertTrue(inputSource.isPdf()); - Assertions.assertTrue(inputSource.hasSourceText()); + Assertions.assertTrue(inputSource.isPDF()); Assertions.assertEquals(3, inputSource.getPageCount()); Assertions.assertEquals("multipage_cut-3.pdf", inputSource.getFilename()); Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(filePath)); @@ -64,15 +63,13 @@ void loadPDF__withoutText_mustNotDetectSourceText() throws MindeeException, IOEx String encodedFile = Base64.encodeBase64String(Files.readAllBytes(filePath)); var localInputSource = new LocalInputSource(encodedFile, "default_sample.pdf"); Assertions.assertNotNull(localInputSource); - Assertions.assertTrue(localInputSource.isPdf()); - Assertions.assertFalse(localInputSource.hasSourceText()); + Assertions.assertTrue(localInputSource.isPDF()); } void assertImage(LocalInputSource inputSource, Path filePath) throws IOException { Assertions.assertNotNull(inputSource); - Assertions.assertFalse(inputSource.isPdf()); - Assertions.assertFalse(inputSource.hasSourceText()); + Assertions.assertFalse(inputSource.isPDF()); Assertions.assertEquals(1, inputSource.getPageCount()); Assertions.assertEquals("receipt.jpg", inputSource.getFilename()); Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(filePath)); diff --git a/src/test/java/com/mindee/pdf/PDFOperationTest.java b/src/test/java/com/mindee/pdf/PDFOperationTest.java index 9cb08277c..b831d92cd 100644 --- a/src/test/java/com/mindee/pdf/PDFOperationTest.java +++ b/src/test/java/com/mindee/pdf/PDFOperationTest.java @@ -9,7 +9,6 @@ import java.io.File; import java.io.IOException; import java.nio.file.Files; -import java.nio.file.Paths; import java.util.ArrayList; import java.util.List; import java.util.Random; @@ -20,40 +19,7 @@ public class PDFOperationTest { - private final PDFOperation pdfOperation = new PDFBoxApi(); - - @Test - public void shouldConvertSinglePageToJpg() throws IOException { - LocalInputSource source = new LocalInputSource( - "src/test/resources/file_types/pdf/multipage.pdf" - ); - PdfPageImage pdfPageImage = pdfOperation.pdfPageToImage(source, 3); - Assertions.assertNotNull(pdfPageImage.getImage()); - Assertions.assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); - pdfPageImage.writeToFile("src/test/resources/output/"); - Assertions - .assertTrue( - Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) - ); - } - - @Test - public void shouldConvertAllPagesToJpg() throws IOException { - LocalInputSource source = new LocalInputSource( - "src/test/resources/file_types/pdf/multipage.pdf" - ); - List pdfPageImages = pdfOperation.pdfToImages(source); - for (PdfPageImage pdfPageImage : pdfPageImages) { - Assertions.assertNotNull(pdfPageImage.getImage()); - Assertions - .assertEquals(pdfPageImage.asInputSource().getFilename(), pdfPageImage.getFilename()); - pdfPageImage.writeToFile("src/test/resources/output/"); - Assertions - .assertTrue( - Files.exists(Paths.get("src/test/resources/output/" + pdfPageImage.getFilename())) - ); - } - } + private final PDFInputOperation pdfOperation = new PDFInputOperator(); @Test public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws IOException { @@ -67,7 +33,7 @@ public void givenADocument_whenPageCounted_thenReturnsCorrectPageCount() throws document.close(); File file = getResourcePath("output/test.pdf").toFile(); LocalInputSource source = new LocalInputSource(file); - Assertions.assertEquals(random, pdfOperation.getNumberOfPages(source)); + Assertions.assertEquals(random, source.getPageCount()); file.delete(); } @@ -124,7 +90,7 @@ public void givenADocumentAndListOfPagesToRemove_whenSplit_thenReturnsOnlyNotRem } @Test - public void givenADocumentOtherThantAPdf_whenSplit_mustFail() throws IOException { + public void givenADocumentOtherThantAPdf_whenSplit_mustFail() { PageOptions pageOptions = new PageOptions.Builder() .pageIndexes(new Integer[] { 1, 2, 3 }) diff --git a/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java b/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java index 794cf7874..dd33c1663 100644 --- a/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java +++ b/src/test/java/com/mindee/v1/fileOperation/InvoiceSplitterAutoExtractionIT.java @@ -68,7 +68,6 @@ public void givenAPDF_shouldExtractInvoices() throws IOException, InterruptedExc InvoiceSplitterV1 inference = document.getInference(); PDFExtractor extractor = new PDFExtractor(invoiceSplitterInputSource); - Assertions.assertEquals(2, extractor.getPageCount()); List extractedPDFsStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), false); Assertions.assertEquals(2, extractedPDFsStrict.size()); diff --git a/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java b/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java index e30aada8e..e68a3d52e 100644 --- a/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java +++ b/src/test/java/com/mindee/v1/pdf/PDFExtractorTest.java @@ -28,7 +28,6 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException { InvoiceSplitterV1 inference = response.getDocument().getInference(); PDFExtractor extractor = new PDFExtractor(pdf); - Assertions.assertEquals(5, extractor.getPageCount()); var extractedPDFSNoStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), false); Assertions.assertEquals(3, extractedPDFSNoStrict.size()); @@ -39,14 +38,13 @@ public void givenAPDF_shouldExtractInvoicesNoStrict() throws IOException { @Test public void givenAPDF_shouldExtractInvoicesStrict() throws IOException { - LocalInputSource pdf = new LocalInputSource( + var inputSource = new LocalInputSource( getV1ResourcePath("products/invoice_splitter/invoice_5p.pdf") ); PredictResponse response = getInvoiceSplitterPrediction(); InvoiceSplitterV1 inference = response.getDocument().getInference(); - PDFExtractor extractor = new PDFExtractor(pdf); - Assertions.assertEquals(5, extractor.getPageCount()); + PDFExtractor extractor = new PDFExtractor(inputSource); var extractedPDFStrict = extractor .extractInvoices(inference.getPrediction().getInvoicePageGroups(), true); Assertions.assertEquals(2, extractedPDFStrict.size());