Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 37 additions & 15 deletions src/main/java/com/mindee/image/ImageExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,17 @@
import com.mindee.geometry.PositionDataField;
import com.mindee.input.InputSourceUtils;
import com.mindee.input.LocalInputSource;
import com.mindee.pdf.PDFBoxApi;
import com.mindee.pdf.PDFOperation;
import com.mindee.pdf.PdfPageImage;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;

/**
* Extract sub-images from an image.
Expand All @@ -22,14 +24,14 @@ public class ImageExtractor {
private final String filename;
private final String saveFormat;

public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws IOException {
public ImageExtractor(LocalInputSource source) throws IOException {
this.filename = source.getFilename();
this.pageImages = new ArrayList<>();

if (source.isPdf()) {
if (source.isPDF()) {
this.saveFormat = "jpg";
var pdfPageImages = pdfOperation.pdfToImages(source);
for (PdfPageImage pdfPageImage : pdfPageImages) {
var pdfPageImages = pdfToImages(source.getFile(), this.filename);
for (PDFPageImage pdfPageImage : pdfPageImages) {
this.pageImages.add(pdfPageImage.getImage());
}
} else {
Expand All @@ -41,14 +43,34 @@ public ImageExtractor(LocalInputSource source, PDFOperation pdfOperation) throws
}
}

/**
* Init from a {@link LocalInputSource}.
*
* @param source The local source.
* @throws IOException Throws if the file can't be accessed.
*/
public ImageExtractor(LocalInputSource source) throws IOException {
this(source, new PDFBoxApi());
/**
 * Renders every page of a PDF into an image.
 *
 * @param fileBytes The raw bytes of the PDF file.
 * @param filename Name of the source file, passed on to each {@link PDFPageImage}.
 * @return One {@link PDFPageImage} per page, in page order. Each is created with
 *     {@code "jpg"} as its last constructor argument (presumably the save format).
 * @throws IOException If the PDF can't be loaded or a page can't be rendered.
 */
public List<PDFPageImage> pdfToImages(byte[] fileBytes, String filename) throws IOException {
  // try-with-resources guarantees the document is closed even when rendering a page fails.
  try (PDDocument document = Loader.loadPDF(fileBytes)) {
    var pdfRenderer = new PDFRenderer(document);
    int pageCount = document.getNumberOfPages();
    List<PDFPageImage> pdfPageImages = new ArrayList<>(pageCount);
    for (int i = 0; i < pageCount; i++) {
      var imageBuffer = pdfPageToImageBuffer(i, document, pdfRenderer);
      pdfPageImages.add(new PDFPageImage(imageBuffer, i, filename, "jpg"));
    }
    return pdfPageImages;
  }
}

/**
 * Renders one page of a PDF as an RGB image.
 *
 * <p>The rendering DPI is picked from the area of the page's bounding box
 * (width times height): 300 below 200000, 250 below 300000, and 200 otherwise.
 *
 * @param index Zero-based index of the page to render.
 * @param document The document the page belongs to.
 * @param pdfRenderer Renderer used to produce the image.
 * @return The rendered page.
 * @throws IOException Propagated from the page lookup or the rendering call.
 */
private BufferedImage pdfPageToImageBuffer(
    int index,
    PDDocument document,
    PDFRenderer pdfRenderer
) throws IOException {
  PDRectangle pageBox = document.getPage(index).getBBox();
  float area = pageBox.getWidth() * pageBox.getHeight();
  // Smaller pages get a higher DPI.
  int dpi = area < 200000 ? 300 : (area < 300000 ? 250 : 200);
  return pdfRenderer.renderImageWithDPI(index, dpi, ImageType.RGB);
}

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package com.mindee.pdf;
package com.mindee.image;

import com.mindee.MindeeException;
import com.mindee.input.InputSourceUtils;
Expand All @@ -16,13 +16,13 @@
* A page in a PDF extracted as an image.
*/
@Getter
public class PdfPageImage {
public class PDFPageImage {
private final BufferedImage image;
private final int originalIndex;
private final String saveFormat;
private final String originalFilename;

public PdfPageImage(
public PDFPageImage(
BufferedImage image,
int originalIndex,
String originalFilename,
Expand Down
48 changes: 0 additions & 48 deletions src/main/java/com/mindee/input/InputSourceUtils.java
Original file line number Diff line number Diff line change
@@ -1,12 +1,6 @@
package com.mindee.input;

import com.mindee.MindeeException;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/**
* Utilities for working with files.
Expand Down Expand Up @@ -65,46 +59,4 @@ public static String[] splitNameStrict(String filename) throws MindeeException {
}
return new String[] { name, extension };
}

/**
 * Returns true if the file is a PDF.
 *
 * @param fileBytes The raw bytes of the file to check.
 * @return True if the bytes can be loaded as a PDF document, false if loading fails with an
 *     {@link IOException}.
 */
public static boolean isPdf(byte[] fileBytes) {
  // The document is loaded only to validate the bytes; close it right away so it isn't leaked.
  try (
      var ignored = Loader
        .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)))
  ) {
    return true;
  } catch (IOException e) {
    return false;
  }
}

/**
 * Returns true if the source PDF has source text inside. Returns false for images.
 *
 * @param fileBytes A byte array representing a PDF.
 * @return True if at least one page yields non-empty text after trimming; false otherwise,
 *     including when the bytes can't be read as a PDF.
 */
public static boolean hasSourceText(byte[] fileBytes) {
  // try-with-resources closes the document on every path, including when text extraction throws.
  try (
      PDDocument document = Loader
        .loadPDF(new RandomAccessReadBuffer(new ByteArrayInputStream(fileBytes)))
  ) {
    PDFTextStripper stripper = new PDFTextStripper();
    int pageCount = document.getNumberOfPages();
    // The stripper is given 1-based page numbers, one page at a time, to allow an early exit.
    for (int page = 1; page <= pageCount; page++) {
      stripper.setStartPage(page);
      stripper.setEndPage(page);
      if (!stripper.getText(document).trim().isEmpty()) {
        return true;
      }
    }
    return false;
  } catch (IOException e) {
    // Unreadable input is reported as having no source text rather than raising an error.
    return false;
  }
}
}
73 changes: 46 additions & 27 deletions src/main/java/com/mindee/input/LocalInputSource.java
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
package com.mindee.input;

import com.mindee.image.ImageCompressor;
import com.mindee.pdf.PDFBoxApi;
import com.mindee.pdf.PDFCompression;
import com.mindee.pdf.PDFCompressor;
import com.mindee.pdf.PDFOperation;
import com.mindee.pdf.PDFInputOperation;
import com.mindee.pdf.PDFInputOperator;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
Expand All @@ -17,14 +18,18 @@
/**
* A source document for Mindee API operations.
*/
public final class LocalInputSource {
public class LocalInputSource {

@Getter
private byte[] file;
@Getter
private final String filename;
@Setter
private PDFOperation pdfOperation;
private PDFInputOperation pdfInputOperator;
@Setter
private PDFCompression pdfCompressor;
// Store here to avoid recalculating every time.
private Boolean isPDF;

public LocalInputSource(InputStream file, String filename) throws IOException {
this.file = IOUtils.toByteArray(file);
Expand Down Expand Up @@ -57,11 +62,18 @@ public LocalInputSource(String fileAsBase64, String filename) {
this.filename = filename;
}

public PDFOperation getPdfOperation() {
if (this.pdfOperation == null) {
this.pdfOperation = new PDFBoxApi();
private PDFInputOperation getPDFInputOperator() {
if (this.pdfInputOperator == null) {
this.pdfInputOperator = new PDFInputOperator();
}
return this.pdfInputOperator;
}

private PDFCompression getPDFCompressor() {
if (this.pdfCompressor == null) {
this.pdfCompressor = new PDFCompressor();
}
return this.pdfOperation;
return this.pdfCompressor;
}

/**
Expand All @@ -71,10 +83,10 @@ public PDFOperation getPdfOperation() {
* @throws IOException If an I/O error occurs during the PDF operation.
*/
public int getPageCount() throws IOException {
if (!this.isPdf()) {
if (!this.isPDF()) {
return 1;
}
return getPdfOperation().getNumberOfPages(this.file);
return getPDFInputOperator().getPageCount(this.file);
}

/**
Expand All @@ -84,51 +96,58 @@ public int getPageCount() throws IOException {
* @throws IOException If an I/O error occurs during the PDF operation.
*/
public void applyPageOptions(PageOptions pageOptions) throws IOException {
if (pageOptions != null && this.isPdf()) {
this.file = getPdfOperation().split(this.file, pageOptions).getFile();
if (pageOptions != null && this.isPDF()) {
this.file = getPDFInputOperator().split(this.file, pageOptions).getFile();
}
}

public boolean isPdf() {
return InputSourceUtils.isPdf(this.file);
}

public boolean hasSourceText() {
return InputSourceUtils.hasSourceText(this.file);
/**
 * Tells whether this source holds a PDF.
 *
 * <p>The answer is obtained from the PDF input operator on the first call and stored in the
 * {@code isPDF} field, so later calls reuse it instead of checking the file again.
 *
 * @return True if the file is a PDF.
 */
public boolean isPDF() {
  if (this.isPDF != null) {
    return this.isPDF;
  }
  this.isPDF = getPDFInputOperator().isPDF(this.file);
  return this.isPDF;
}

public void compress(
Integer quality,
int quality,
Integer maxWidth,
Integer maxHeight,
Boolean forceSourceText,
Boolean disableSourceText
) throws IOException {
if (isPdf()) {
this.file = PDFCompressor.compressPdf(this.file, quality, forceSourceText, disableSourceText);
if (isPDF()) {
this.file = getPDFCompressor()
.compressPDF(this.file, quality, forceSourceText, disableSourceText);
} else {
this.file = ImageCompressor.compressImage(this.file, quality, maxWidth, maxHeight);
}
}

public void compress(
Integer quality,
int quality,
Integer maxWidth,
Integer maxHeight,
Boolean forceSourceText
) throws IOException {
this.compress(quality, maxWidth, maxHeight, forceSourceText, true);
}

public void compress(Integer quality, Integer maxWidth, Integer maxHeight) throws IOException {
this.compress(quality, maxWidth, maxHeight, false, true);
/**
 * Compresses the file without giving any maximum dimensions.
 *
 * <p>Delegates to the five-argument overload, passing {@code null} for both the maximum width
 * and the maximum height.
 *
 * @param quality Compression quality, forwarded as-is.
 * @param forceSourceText Forwarded as-is to the five-argument overload.
 * @param disableSourceText Forwarded as-is to the five-argument overload.
 * @throws IOException Propagated from the five-argument overload.
 */
public void compress(
    int quality,
    boolean forceSourceText,
    boolean disableSourceText
) throws IOException {
  final Integer noMaxWidth = null;
  final Integer noMaxHeight = null;
  compress(quality, noMaxWidth, noMaxHeight, forceSourceText, disableSourceText);
}

public void compress(Integer quality, Integer maxWidth) throws IOException {
this.compress(quality, maxWidth, null, false, true);
public void compress(int quality, Integer maxWidth, Integer maxHeight) throws IOException {
this.compress(quality, maxWidth, maxHeight, false, true);
}

public void compress(Integer quality) throws IOException {
public void compress(int quality) throws IOException {
this.compress(quality, null, null, false, true);
}

Expand Down
39 changes: 4 additions & 35 deletions src/main/java/com/mindee/pdf/BasePDFExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -33,9 +32,9 @@ public class BasePDFExtractor {
* @param source The local source.
* @throws IOException Throws if the file can't be accessed.
*/
protected BasePDFExtractor(LocalInputSource source) throws IOException {
public BasePDFExtractor(LocalInputSource source) throws IOException {
this.filename = source.getFilename();
if (source.isPdf()) {
if (source.isPDF()) {
this.sourcePdf = Loader.loadPDF(source.getFile());
} else {
var document = new PDDocument();
Expand All @@ -57,15 +56,6 @@ protected BasePDFExtractor(LocalInputSource source) throws IOException {
}
}

/**
* Get the number of pages in the PDF file.
*
* @return The number of pages in the PDF file.
*/
public int getPageCount() {
return sourcePdf.getNumberOfPages();
}

/**
* Converts an array to a buffered image.
*
Expand Down Expand Up @@ -106,10 +96,7 @@ public List<ExtractedPDF> extractSubDocuments(
+ splitName[1];
extractedPDFs
.add(
new ExtractedPDF(
Loader.loadPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false)),
fieldFilename
)
new ExtractedPDF(mergePdfPages(this.sourcePdf, pageIndexElement, false), fieldFilename)
);
}
return extractedPDFs;
Expand Down Expand Up @@ -149,25 +136,7 @@ private static byte[] createPdfFromExistingPdf(
return output;
}

/**
* Merge specified PDF pages together.
*
* @param file The PDF file.
* @param pageNumbers List of page numbers to merge together.
*/
public static byte[] mergePdfPages(File file, List<Integer> pageNumbers) throws IOException {
PDDocument document = Loader.loadPDF(file);
return createPdfFromExistingPdf(document, pageNumbers, true);
}

public static byte[] mergePdfPages(
PDDocument document,
List<Integer> pageNumbers
) throws IOException {
return mergePdfPages(document, pageNumbers, true);
}

public static byte[] mergePdfPages(
public byte[] mergePdfPages(
PDDocument document,
List<Integer> pageNumbers,
boolean closeOriginal
Expand Down
Loading
Loading