From 39aaf49c04b61ae7473ef20e36f00177a39165cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ianar=C3=A9=20S=C3=A9vi?= Date: Mon, 27 Apr 2026 18:12:26 +0200 Subject: [PATCH] :recycle: rework URL inputs --- .../com/mindee/input/InputSourceUtils.java | 10 ---- .../java/com/mindee/input/URLInputSource.java | 58 +++++++++++++------ src/main/java/com/mindee/v1/MindeeClient.java | 29 +++++----- .../com/mindee/v1/http/RequestParameters.java | 10 +++- src/main/java/com/mindee/v2/MindeeClient.java | 10 +++- .../com/mindee/v2/http/MindeeHttpApiV2.java | 2 +- .../mindee/input/LocalInputSourceTest.java | 50 ++++++---------- .../com/mindee/input/URLInputSourceTest.java | 9 +-- .../java/com/mindee/v1/MindeeClientTest.java | 4 +- .../mindee/v1/http/MindeeHttpApiV1Test.java | 4 +- 10 files changed, 97 insertions(+), 89 deletions(-) diff --git a/src/main/java/com/mindee/input/InputSourceUtils.java b/src/main/java/com/mindee/input/InputSourceUtils.java index 1c1b1eff4..076320b56 100644 --- a/src/main/java/com/mindee/input/InputSourceUtils.java +++ b/src/main/java/com/mindee/input/InputSourceUtils.java @@ -3,7 +3,6 @@ import com.mindee.MindeeException; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.net.URL; import org.apache.pdfbox.Loader; import org.apache.pdfbox.io.RandomAccessReadBuffer; import org.apache.pdfbox.pdmodel.PDDocument; @@ -79,15 +78,6 @@ public static boolean isPdf(byte[] fileBytes) { return true; } - /** - * Ensures the URL can be sent to the Mindee server. - */ - public static void validateUrl(URL inputUrl) { - if (!"https".equalsIgnoreCase(inputUrl.getProtocol())) { - throw new MindeeException("Only HTTPS source URLs are allowed"); - } - } - /** * Returns true if the source PDF has source text inside. Returns false for images. * diff --git a/src/main/java/com/mindee/input/URLInputSource.java b/src/main/java/com/mindee/input/URLInputSource.java index 7d0b19e39..479f2ba5c 100644 --- a/src/main/java/com/mindee/input/URLInputSource.java +++ b/src/main/java/com/mindee/input/URLInputSource.java @@ -1,11 +1,13 @@ package com.mindee.input; +import com.mindee.MindeeException; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.net.HttpURLConnection; +import java.net.MalformedURLException; import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; @@ -19,7 +21,7 @@ */ public class URLInputSource { @Getter - private final String url; + private final URL url; private final String username; private final String password; @Getter @@ -43,20 +45,21 @@ public class URLInputSource { * @param url URL to fetch the file from. * @return An instance of {@link URLInputSource}. */ - public static Builder builder(String url) { - return new Builder(url); + public static Builder builder(String url) throws MalformedURLException { + return new Builder(new URL(url)); } - private HttpURLConnection prepareConnection() throws IOException { - HttpURLConnection connection = createConnection(url); - connection = handleRedirects(connection); + public static Builder builder(URL url) { + return new Builder(url); + } - int responseCode = connection.getResponseCode(); - if (responseCode != HttpURLConnection.HTTP_OK) { - throw new IOException("Failed to fetch file: " + responseCode); + /** + * Ensures the URL can be sent to the Mindee server. + */ + public void validateSecure() { + if (!"https".equalsIgnoreCase(this.url.getProtocol())) { + throw new MindeeException("Only HTTPS source URLs are allowed"); } - - return connection; } /** @@ -72,8 +75,20 @@ public void fetchFile() throws IOException { } } - protected HttpURLConnection createConnection(String urlString) throws IOException { - HttpURLConnection connection = (HttpURLConnection) new URL(urlString).openConnection(); + private HttpURLConnection prepareConnection() throws IOException { + HttpURLConnection connection = createConnection(url); + connection = handleRedirects(connection); + + int responseCode = connection.getResponseCode(); + if (responseCode != HttpURLConnection.HTTP_OK) { + throw new IOException("Failed to fetch file: " + responseCode); + } + + return connection; + } + + protected HttpURLConnection createConnection(URL url) throws IOException { + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); connection.setInstanceFollowRedirects(true); if (username != null && password != null) { @@ -101,7 +116,7 @@ private HttpURLConnection handleRedirects(HttpURLConnection connection) throws I String newUrl = connection.getHeaderField("Location"); connection.disconnect(); - HttpURLConnection newConnection = createConnection(newUrl); + HttpURLConnection newConnection = createConnection(new URL(newUrl)); return handleRedirects(newConnection); // Recursive call to handle multiple redirects } return connection; @@ -189,18 +204,27 @@ public void cleanup() { * Builder class for an URLInputSource. */ public static class Builder { - private final String url; + private final URL url; private String username; private String password; private String localFilename; private String token; /** - * Default constructor. + * String constructor. + * + * @param url Remote URL resource. + */ + public Builder(String url) throws MalformedURLException { + this.url = new URL(url); + } + + /** + * URL constructor. * * @param url Remote URL resource. */ - public Builder(String url) { + public Builder(URL url) { this.url = url; } diff --git a/src/main/java/com/mindee/v1/MindeeClient.java b/src/main/java/com/mindee/v1/MindeeClient.java index 5f353414c..cd167d2b1 100644 --- a/src/main/java/com/mindee/v1/MindeeClient.java +++ b/src/main/java/com/mindee/v1/MindeeClient.java @@ -1,9 +1,9 @@ package com.mindee.v1; import com.mindee.MindeeException; -import com.mindee.input.InputSourceUtils; import com.mindee.input.LocalInputSource; import com.mindee.input.PageOptions; +import com.mindee.input.URLInputSource; import com.mindee.pdf.PDFBoxApi; import com.mindee.pdf.PDFOperation; import com.mindee.v1.clientOptions.PollingOptions; @@ -203,7 +203,6 @@ public AsyncPredictResponse enqueue( Class type, URL sourceUrl ) throws IOException { - InputSourceUtils.validateUrl(sourceUrl); return this.enqueue(type, new Endpoint(type), null, null, null, sourceUrl); } @@ -222,7 +221,6 @@ public AsyncPredictResponse enqueue( URL sourceUrl, PredictOptions predictOptions ) throws IOException { - InputSourceUtils.validateUrl(sourceUrl); return this.enqueue(type, new Endpoint(type), null, null, predictOptions, sourceUrl); } @@ -232,8 +230,12 @@ private AsyncPredictResponse enqueue( byte[] file, String filename, PredictOptions predictOptions, - URL urlInputSource + URL url ) throws IOException { + URLInputSource urlInputSource = null; + if (url != null) { + urlInputSource = new URLInputSource.Builder(url).build(); + } RequestParameters params = RequestParameters .builder() .file(file) @@ -402,7 +404,6 @@ public AsyncPredictResponse enqueueAndParse( Class type, URL sourceUrl ) throws IOException, InterruptedException { - InputSourceUtils.validateUrl(sourceUrl); return this.enqueueAndParse(type, new Endpoint(type), null, null, null, null, sourceUrl); } @@ -441,11 +442,12 @@ private AsyncPredictResponse enqueueAndParse( byte[] file, String filename, PredictOptions predictOptions, - URL urlInputSource + URL url ) throws IOException, InterruptedException { if (pollingOptions == null) { pollingOptions = PollingOptions.builder().build(); } + this.validateAsyncParams(pollingOptions); final int initialDelaySec = (int) (pollingOptions.getInitialDelaySec() * 1000); final int intervalSec = (int) (pollingOptions.getIntervalSec() * 1000); @@ -456,7 +458,7 @@ private AsyncPredictResponse enqueueAndParse( file, filename, predictOptions, - urlInputSource + url ); String jobId = enqueueResponse.getJob().getId(); @@ -648,7 +650,6 @@ public PredictResponse parse( Class type, URL urlInputSource ) throws IOException { - InputSourceUtils.validateUrl(urlInputSource); return this.parse(type, new Endpoint(type), null, null, null, urlInputSource); } @@ -667,7 +668,6 @@ public PredictResponse parse( URL urlInputSource, PredictOptions predictOptions ) throws IOException { - InputSourceUtils.validateUrl(urlInputSource); return this.parse(type, new Endpoint(type), null, null, predictOptions, urlInputSource); } @@ -677,8 +677,12 @@ private PredictResponse parse( byte[] file, String filename, PredictOptions predictOptions, - URL urlInputSource + URL url ) throws IOException { + URLInputSource urlInputSource = null; + if (url != null) { + urlInputSource = new URLInputSource.Builder(url).build(); + } RequestParameters params = RequestParameters .builder() .file(file) @@ -760,7 +764,6 @@ public AsyncPredictResponse enqueue( Endpoint endpoint, URL sourceUrl ) throws IOException { - InputSourceUtils.validateUrl(sourceUrl); return this.enqueue(type, endpoint, null, null, null, sourceUrl); } @@ -781,7 +784,6 @@ public AsyncPredictResponse enqueue( URL sourceUrl, PredictOptions predictOptions ) throws IOException { - InputSourceUtils.validateUrl(sourceUrl); return this.enqueue(type, endpoint, null, null, predictOptions, sourceUrl); } @@ -893,7 +895,6 @@ public AsyncPredictResponse enqueueAndParse( Endpoint endpoint, URL sourceUrl ) throws IOException, InterruptedException { - InputSourceUtils.validateUrl(sourceUrl); return this.enqueueAndParse(type, endpoint, null, null, null, null, sourceUrl); } @@ -1024,7 +1025,6 @@ public PredictResponse parse( Endpoint endpoint, URL documentUrl ) throws IOException { - InputSourceUtils.validateUrl(documentUrl); return this.parse(type, endpoint, null, null, null, documentUrl); } @@ -1045,7 +1045,6 @@ public PredictResponse parse( URL documentUrl, PredictOptions predictOptions ) throws IOException { - InputSourceUtils.validateUrl(documentUrl); return this.parse(type, endpoint, null, null, predictOptions, documentUrl); } diff --git a/src/main/java/com/mindee/v1/http/RequestParameters.java b/src/main/java/com/mindee/v1/http/RequestParameters.java index c227acd14..cf6ee925d 100644 --- a/src/main/java/com/mindee/v1/http/RequestParameters.java +++ b/src/main/java/com/mindee/v1/http/RequestParameters.java @@ -1,5 +1,6 @@ package com.mindee.v1.http; +import com.mindee.input.URLInputSource; import com.mindee.v1.clientOptions.PredictOptions; import com.mindee.v1.clientOptions.WorkflowOptions; import java.net.URL; @@ -20,7 +21,7 @@ public class RequestParameters { @Builder private RequestParameters( - URL urlInputSource, + URLInputSource urlInputSource, byte[] file, PredictOptions predictOptions, WorkflowOptions workflowOptions, @@ -39,7 +40,12 @@ private RequestParameters( } else { this.workflowOptions = workflowOptions; } - this.fileUrl = urlInputSource; + if (urlInputSource != null) { + urlInputSource.validateSecure(); + this.fileUrl = urlInputSource.getUrl(); + } else { + this.fileUrl = null; + } this.file = file; this.fileName = fileName; } diff --git a/src/main/java/com/mindee/v2/MindeeClient.java b/src/main/java/com/mindee/v2/MindeeClient.java index edbfe10e7..54cda7d76 100644 --- a/src/main/java/com/mindee/v2/MindeeClient.java +++ b/src/main/java/com/mindee/v2/MindeeClient.java @@ -54,6 +54,7 @@ public JobResponse enqueue( * @param params The parameters to send along with the file. */ public JobResponse enqueue(URLInputSource inputSource, BaseParameters params) throws IOException { + inputSource.validateSecure(); return mindeeApi.reqPostEnqueue(inputSource, params); } @@ -141,8 +142,12 @@ public TResponse enqueueAndGetResult( URLInputSource inputSource, BaseParameters params ) throws IOException, InterruptedException { - JobResponse job = enqueue(inputSource, params); - return pollAndFetch(responseClass, job, PollingOptions.builder().build()); + return enqueueAndGetResult( + responseClass, + inputSource, + params, + PollingOptions.builder().build() + ); } /** @@ -162,6 +167,7 @@ public TResponse enqueueAndGetResult( BaseParameters params, PollingOptions pollingOptions ) throws IOException, InterruptedException { + inputSource.validateSecure(); JobResponse job = enqueue(inputSource, params); return pollAndFetch(responseClass, job, pollingOptions); } diff --git a/src/main/java/com/mindee/v2/http/MindeeHttpApiV2.java b/src/main/java/com/mindee/v2/http/MindeeHttpApiV2.java index 48669a4b1..f8aea4718 100644 --- a/src/main/java/com/mindee/v2/http/MindeeHttpApiV2.java +++ b/src/main/java/com/mindee/v2/http/MindeeHttpApiV2.java @@ -101,7 +101,7 @@ public JobResponse reqPostEnqueue(URLInputSource inputSource, BaseParameters opt var builder = MultipartEntityBuilder.create(); builder.setMode(HttpMultipartMode.EXTENDED); - builder.addTextBody("url", inputSource.getUrl()); + builder.addTextBody("url", inputSource.getUrl().toString()); post.setEntity(options.buildHttpBody(builder).build()); return executeEnqueue(post); } diff --git a/src/test/java/com/mindee/input/LocalInputSourceTest.java b/src/test/java/com/mindee/input/LocalInputSourceTest.java index 6e7815c57..6482bf51e 100644 --- a/src/test/java/com/mindee/input/LocalInputSourceTest.java +++ b/src/test/java/com/mindee/input/LocalInputSourceTest.java @@ -16,29 +16,24 @@ public class LocalInputSourceTest { void assertMultipagePDF(LocalInputSource inputSource, Path filePath) throws IOException { Assertions.assertNotNull(inputSource); - String filename = inputSource.getFilename(); - boolean isPdf = inputSource.isPdf(); - boolean hasSourceText = inputSource.hasSourceText(); - int numberOfPages = inputSource.getPageCount(); - - Assertions.assertTrue(isPdf); - Assertions.assertTrue(hasSourceText); - Assertions.assertEquals(3, numberOfPages); - Assertions.assertEquals("multipage_cut-3.pdf", filename); + Assertions.assertTrue(inputSource.isPdf()); + Assertions.assertTrue(inputSource.hasSourceText()); + Assertions.assertEquals(3, inputSource.getPageCount()); + Assertions.assertEquals("multipage_cut-3.pdf", inputSource.getFilename()); Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(filePath)); } @Test void loadPDF_withFile_mustReturnAValidLocalInputSource() throws IOException { File file = getResourcePath("file_types/pdf/multipage_cut-3.pdf").toFile(); - LocalInputSource localInputSource = new LocalInputSource(file); + var localInputSource = new LocalInputSource(file); assertMultipagePDF(localInputSource, file.toPath()); } @Test void loadPDF_withInputStream_mustReturnAValidLocalInputSource() throws IOException { Path filePath = getResourcePath("file_types/pdf/multipage_cut-3.pdf"); - LocalInputSource localInputSource = new LocalInputSource( + var localInputSource = new LocalInputSource( Files.newInputStream(filePath), "multipage_cut-3.pdf" ); @@ -48,7 +43,7 @@ void loadPDF_withInputStream_mustReturnAValidLocalInputSource() throws IOExcepti @Test void loadPDF_withByteArray_mustReturnAValidLocalInputSource() throws IOException { Path filePath = getResourcePath("file_types/pdf/multipage_cut-3.pdf"); - LocalInputSource localInputSource = new LocalInputSource( + var localInputSource = new LocalInputSource( Files.readAllBytes(filePath), "multipage_cut-3.pdf" ); @@ -59,7 +54,7 @@ void loadPDF_withByteArray_mustReturnAValidLocalInputSource() throws IOException void loadPDF_withBase64Encoded_mustReturnAValidLocalInputSource() throws IOException { Path filePath = getResourcePath("file_types/pdf/multipage_cut-3.pdf"); String encodedFile = Base64.encodeBase64String(Files.readAllBytes(filePath)); - LocalInputSource localInputSource = new LocalInputSource(encodedFile, "multipage_cut-3.pdf"); + var localInputSource = new LocalInputSource(encodedFile, "multipage_cut-3.pdf"); assertMultipagePDF(localInputSource, filePath); } @@ -67,7 +62,7 @@ void loadPDF_withBase64Encoded_mustReturnAValidLocalInputSource() throws IOExcep void loadPDF__withoutText_mustNotDetectSourceText() throws MindeeException, IOException { Path filePath = getV1ResourcePath("products/invoice_splitter/default_sample.pdf"); String encodedFile = Base64.encodeBase64String(Files.readAllBytes(filePath)); - LocalInputSource localInputSource = new LocalInputSource(encodedFile, "default_sample.pdf"); + var localInputSource = new LocalInputSource(encodedFile, "default_sample.pdf"); Assertions.assertNotNull(localInputSource); Assertions.assertTrue(localInputSource.isPdf()); Assertions.assertFalse(localInputSource.hasSourceText()); @@ -76,42 +71,31 @@ void loadPDF__withoutText_mustNotDetectSourceText() throws MindeeException, IOEx void assertImage(LocalInputSource inputSource, Path filePath) throws IOException { Assertions.assertNotNull(inputSource); - String filename = inputSource.getFilename(); - boolean isPdf = inputSource.isPdf(); - boolean hasSourceText = inputSource.hasSourceText(); - int numberOfPages = inputSource.getPageCount(); - - Assertions.assertFalse(isPdf); - Assertions.assertFalse(hasSourceText); - Assertions.assertEquals(1, numberOfPages); - Assertions.assertEquals("receipt.jpg", filename); + Assertions.assertFalse(inputSource.isPdf()); + Assertions.assertFalse(inputSource.hasSourceText()); + Assertions.assertEquals(1, inputSource.getPageCount()); + Assertions.assertEquals("receipt.jpg", inputSource.getFilename()); Assertions.assertArrayEquals(inputSource.getFile(), Files.readAllBytes(filePath)); } @Test void loadImage_withFile_mustReturnAValidLocalInputSource() throws IOException { File file = getResourcePath("file_types/receipt.jpg").toFile(); - LocalInputSource localInputSource = new LocalInputSource(file); + var localInputSource = new LocalInputSource(file); assertImage(localInputSource, file.toPath()); } @Test void loadImage_withInputStream_mustReturnAValidLocalInputSource() throws IOException { Path filePath = getResourcePath("file_types/receipt.jpg"); - LocalInputSource localInputSource = new LocalInputSource( - Files.newInputStream(filePath), - "receipt.jpg" - ); + var localInputSource = new LocalInputSource(Files.newInputStream(filePath), "receipt.jpg"); assertImage(localInputSource, filePath); } @Test void loadImage_withByteArray_mustReturnAValidLocalInputSource() throws IOException { Path filePath = getResourcePath("file_types/receipt.jpg"); - LocalInputSource localInputSource = new LocalInputSource( - Files.readAllBytes(filePath), - "receipt.jpg" - ); + var localInputSource = new LocalInputSource(Files.readAllBytes(filePath), "receipt.jpg"); assertImage(localInputSource, filePath); } @@ -119,7 +103,7 @@ void loadImage_withByteArray_mustReturnAValidLocalInputSource() throws IOExcepti void loadImage_withBase64Encoded_mustReturnAValidLocalInputSource() throws IOException { Path filePath = getResourcePath("file_types/receipt.jpg"); String encodedFile = Base64.encodeBase64String(Files.readAllBytes(filePath)); - LocalInputSource localInputSource = new LocalInputSource(encodedFile, "receipt.jpg"); + var localInputSource = new LocalInputSource(encodedFile, "receipt.jpg"); assertImage(localInputSource, filePath); } diff --git a/src/test/java/com/mindee/input/URLInputSourceTest.java b/src/test/java/com/mindee/input/URLInputSourceTest.java index 91aec4da5..3f78505dd 100644 --- a/src/test/java/com/mindee/input/URLInputSourceTest.java +++ b/src/test/java/com/mindee/input/URLInputSourceTest.java @@ -5,6 +5,8 @@ import java.io.File; import java.io.IOException; import java.net.HttpURLConnection; +import java.net.MalformedURLException; +import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -18,7 +20,7 @@ public class URLInputSourceTest { private TestableURLInputSource urlInputSource; @BeforeEach - public void setUp() { + public void setUp() throws MalformedURLException { urlInputSource = new TestableURLInputSource(TEST_URL); } @@ -92,13 +94,12 @@ static class TestableURLInputSource extends URLInputSource { private String mockRedirectUrl; private boolean isRedirected = false; - public TestableURLInputSource(String url) { + public TestableURLInputSource(String url) throws MalformedURLException { super(builder(url)); } @Override - protected HttpURLConnection createConnection(String urlString) throws IOException { - java.net.URL url = new java.net.URL(urlString); + protected HttpURLConnection createConnection(URL url) { boolean wasRedirected = isRedirected; if (!isRedirected && mockRedirectUrl != null) { diff --git a/src/test/java/com/mindee/v1/MindeeClientTest.java b/src/test/java/com/mindee/v1/MindeeClientTest.java index c2cc65079..265dffacd 100644 --- a/src/test/java/com/mindee/v1/MindeeClientTest.java +++ b/src/test/java/com/mindee/v1/MindeeClientTest.java @@ -113,9 +113,7 @@ void givenADocumentUrl_whenParsed_shouldCallApiWithCorrectParams() throws IOExce @Test void givenAnAsyncDoc_whenEnqueued_shouldInvokeApiCorrectly() throws IOException { - LocalInputSource localInputSource = new LocalInputSource( - getResourcePath("file_types/pdf/blank_1.pdf") - ); + var localInputSource = new LocalInputSource(getResourcePath("file_types/pdf/blank_1.pdf")); Job job = new Job(LocalDateTime.now(), "someid", LocalDateTime.now(), "Completed", null); diff --git a/src/test/java/com/mindee/v1/http/MindeeHttpApiV1Test.java b/src/test/java/com/mindee/v1/http/MindeeHttpApiV1Test.java index 9efd4dd19..f0e91ba0a 100644 --- a/src/test/java/com/mindee/v1/http/MindeeHttpApiV1Test.java +++ b/src/test/java/com/mindee/v1/http/MindeeHttpApiV1Test.java @@ -12,6 +12,7 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.github.tomakehurst.wiremock.junit.WireMockRule; +import com.mindee.input.URLInputSource; import com.mindee.v1.MindeeSettings; import com.mindee.v1.clientOptions.PredictOptions; import com.mindee.v1.parsing.common.AsyncPredictResponse; @@ -21,7 +22,6 @@ import com.mindee.v1.product.invoicesplitter.InvoiceSplitterV1; import java.io.File; import java.io.IOException; -import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; @@ -189,7 +189,7 @@ void givenParseParametersWithFileUrl_whenParsed_shouldBuildRequestCorrectly() th .builder() .file(null) .fileName(null) - .urlInputSource(new URL("https://thisfile.does.not.exist")) + .urlInputSource(new URLInputSource.Builder("https://thisfile.does.not.exist").build()) .build() ) .getDocument();