MLE-26420 Can now perform incremental writes #1868
Merged
...pi/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java (new file, 57 additions, 0 deletions)

```java
/*
 * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
 */
package com.marklogic.client.datamovement.filter;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.io.JacksonHandle;

import java.util.function.Consumer;

/**
 * Uses server-side JavaScript code to get the existing hash values for a set of URIs.
 *
 * @since 8.1.0
 */
class IncrementalWriteEvalFilter extends IncrementalWriteFilter {

    private static final String EVAL_SCRIPT = """
        const tuples = cts.valueTuples([cts.uriReference(), cts.fieldReference(fieldName)], null, cts.documentQuery(uris));
        const response = {};
        for (var tuple of tuples) {
            response[tuple[0]] = tuple[1];
        }
        response
        """;

    IncrementalWriteEvalFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
        super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
    }

    @Override
    public DocumentWriteSet apply(DocumentWriteSetFilter.Context context) {
        ArrayNode uris = new ObjectMapper().createArrayNode();
        for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
            if (DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
                uris.add(doc.getUri());
            }
        }

        JsonNode response = context.getDatabaseClient().newServerEval().javascript(EVAL_SCRIPT)
            .addVariable("fieldName", fieldName)
            .addVariable("uris", new JacksonHandle(uris))
            .evalAs(JsonNode.class);

        return filterDocuments(context, uri -> {
            if (response.has(uri)) {
                return response.get(uri).asText();
            }
            return null;
        });
    }
}
```
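A note for reviewers, not part of the diff: the eval script returns a flat JSON object keyed by URI, which is what the `response.has(uri)` lookup above relies on. A minimal sketch of that shape, with hypothetical URIs and hash values:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

public class EvalResponseShapeSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical eval response: one entry per URI that already has a stored hash.
        JsonNode response = new ObjectMapper().readTree(
            "{\"/doc1.json\":\"a1b2c3\",\"/doc2.json\":\"d4e5f6\"}");
        // The filter resolves a URI to its stored hash, or null for first-time writes.
        String hash = response.has("/doc1.json") ? response.get("/doc1.json").asText() : null;
        System.out.println(hash); // prints a1b2c3
    }
}
```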
...nt-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java (new file, 187 additions, 0 deletions)

```java
/*
 * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
 */
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.impl.HandleAccessor;
import com.marklogic.client.io.BaseHandle;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.Format;
import net.openhft.hashing.LongHashFunction;
import org.erdtman.jcs.JsonCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;

/**
 * A DocumentWriteSetFilter that skips writing documents whose content has not changed since the last write,
 * based on a hash value stored in a MarkLogic field.
 *
 * @since 8.1.0
 */
public abstract class IncrementalWriteFilter implements DocumentWriteSetFilter {

    protected final Logger logger = LoggerFactory.getLogger(this.getClass());

    public static Builder newBuilder() {
        return new Builder();
    }

    public static class Builder {

        private String fieldName = "incrementalWriteHash";
        private boolean canonicalizeJson = true;
        private boolean useEvalQuery = false;
        private Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

        /**
         * @param fieldName the name of the MarkLogic field that will hold the hash value; defaults to "incrementalWriteHash".
         */
        public Builder fieldName(String fieldName) {
            this.fieldName = fieldName;
            return this;
        }

        /**
         * @param canonicalizeJson whether to canonicalize JSON content before hashing; defaults to true.
         *                         Delegates to https://github.com/erdtman/java-json-canonicalization for canonicalization.
         */
        public Builder canonicalizeJson(boolean canonicalizeJson) {
            this.canonicalizeJson = canonicalizeJson;
            return this;
        }

        /**
         * @param useEvalQuery if true, evaluate server-side JavaScript instead of an Optic query for retrieving hash values; defaults to false.
         */
        public Builder useEvalQuery(boolean useEvalQuery) {
            this.useEvalQuery = useEvalQuery;
            return this;
        }

        /**
         * @param skippedDocumentsConsumer a consumer that will be called with any documents in a batch that were skipped because their content had not changed.
         */
        public Builder onDocumentsSkipped(Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
            this.skippedDocumentsConsumer = skippedDocumentsConsumer;
            return this;
        }

        public IncrementalWriteFilter build() {
            if (useEvalQuery) {
                return new IncrementalWriteEvalFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
            }
            return new IncrementalWriteOpticFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
        }
    }

    protected final String fieldName;
    private final boolean canonicalizeJson;
    private final Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

    // Hardcoding this for now, with a good general-purpose hashing function.
    // See https://xxhash.com for benchmarks.
    private final LongHashFunction hashFunction = LongHashFunction.xx3();

    public IncrementalWriteFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
        this.fieldName = fieldName;
        this.canonicalizeJson = canonicalizeJson;
        this.skippedDocumentsConsumer = skippedDocumentsConsumer;
    }

    protected final DocumentWriteSet filterDocuments(Context context, Function<String, String> hashRetriever) {
        final DocumentWriteSet newWriteSet = context.getDatabaseClient().newDocumentManager().newWriteSet();
        final List<DocumentWriteOperation> skippedDocuments = new ArrayList<>();

        for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
            if (!DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
                newWriteSet.add(doc);
                continue;
            }

            final String contentHash = computeHash(serializeContent(doc));
            final String existingHash = hashRetriever.apply(doc.getUri());
            if (logger.isTraceEnabled()) {
                logger.trace("URI: {}, existing hash: {}, new hash: {}", doc.getUri(), existingHash, contentHash);
            }

            if (existingHash != null) {
                if (!existingHash.equals(contentHash)) {
                    newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
                } else if (skippedDocumentsConsumer != null) {
                    skippedDocuments.add(doc);
                } else {
                    // No consumer, so skip the document silently.
                }
            } else {
                newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
            }
        }

        if (!skippedDocuments.isEmpty()) {
            skippedDocumentsConsumer.accept(skippedDocuments.toArray(new DocumentWriteOperation[0]));
        }

        return newWriteSet;
    }

    private String serializeContent(DocumentWriteOperation doc) {
        String content = HandleAccessor.contentAsString(doc.getContent());

        Format format = null;
        if (doc.getContent() instanceof BaseHandle<?, ?> baseHandle) {
            format = baseHandle.getFormat();
        }

        if (canonicalizeJson && (Format.JSON.equals(format) || isPossiblyJsonContent(content))) {
            try {
                JsonCanonicalizer jc = new JsonCanonicalizer(content);
                return jc.getEncodedString();
            } catch (IOException e) {
                // Going to improve this in the next PR, as I think we can throw an exception if Format = JSON.
                logger.warn("Unable to canonicalize JSON content for URI {}, using original content for hashing; cause: {}",
                    doc.getUri(), e.getMessage());
            }
        }

        return content;
    }

    private boolean isPossiblyJsonContent(String content) {
        // This isn't 100% reliable, as the content could be text that just happens to start with { or [, and so
        // we'll still need to catch an exception if we try to canonicalize non-JSON content.
        String trimmed = content.trim();
        return trimmed.startsWith("{") || trimmed.startsWith("[");
    }

    private String computeHash(String content) {
        byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
        long hash = hashFunction.hashBytes(bytes);
        return Long.toHexString(hash);
    }

    protected static DocumentWriteOperation addHashToMetadata(DocumentWriteOperation op, String fieldName, String hash) {
        DocumentMetadataHandle newMetadata = new DocumentMetadataHandle();
        if (op.getMetadata() != null) {
            DocumentMetadataHandle originalMetadata = (DocumentMetadataHandle) op.getMetadata();
            newMetadata.setPermissions(originalMetadata.getPermissions());
            newMetadata.setCollections(originalMetadata.getCollections());
            newMetadata.setQuality(originalMetadata.getQuality());
            newMetadata.setProperties(originalMetadata.getProperties());
            newMetadata.getMetadataValues().putAll(originalMetadata.getMetadataValues());
        }
        newMetadata.getMetadataValues().put(fieldName, hash);
        return new DocumentWriteOperationImpl(op.getUri(), newMetadata, op.getContent(), op.getTemporalDocumentURI());
    }
}
```
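Not shown in this diff, but as a usage sketch for reviewers: callers configure the filter through the builder above. The values below are the documented defaults; the skipped-documents consumer is illustrative.

```java
import com.marklogic.client.datamovement.filter.IncrementalWriteFilter;

public class FilterConfigSketch {
    public static void main(String[] args) {
        // Defaults shown explicitly; build() returns the Optic-based filter unless useEvalQuery(true) is set.
        IncrementalWriteFilter filter = IncrementalWriteFilter.newBuilder()
            .fieldName("incrementalWriteHash")
            .canonicalizeJson(true)
            .onDocumentsSkipped(skipped ->
                System.out.println("Skipped " + skipped.length + " unchanged documents"))
            .build();
    }
}
```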
...i/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java (new file, 55 additions, 0 deletions)

```java
/*
 * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
 */
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.row.RowTemplate;

import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;

/**
 * Uses an Optic query to get the existing hash values for a set of URIs.
 *
 * @since 8.1.0
 */
class IncrementalWriteOpticFilter extends IncrementalWriteFilter {

    IncrementalWriteOpticFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
        super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
    }

    @Override
    public DocumentWriteSet apply(Context context) {
        final String[] uris = context.getDocumentWriteSet().stream()
            .filter(op -> DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType()))
            .map(DocumentWriteOperation::getUri)
            .toArray(String[]::new);

        // It doesn't seem possible yet to use a DSL query and bind an array of strings to a "uris" param, so using
        // a serialized query instead. That doesn't allow a user to override the query though.
        Map<String, String> existingHashes = new RowTemplate(context.getDatabaseClient()).query(op ->
            op.fromLexicons(Map.of(
                "uri", op.cts.uriReference(),
                "hash", op.cts.fieldReference(super.fieldName)
            )).where(
                op.cts.documentQuery(op.xs.stringSeq(uris))
            ),
            rows -> {
                Map<String, String> map = new HashMap<>();
                rows.forEach(row -> {
                    String uri = row.getString("uri");
                    String existingHash = row.getString("hash");
                    map.put(uri, existingHash);
                });
                return map;
            }
        );

        return filterDocuments(context, uri -> existingHashes.get(uri));
    }
}
```
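Both subclasses inherit hashing from the base class, where canonicalizeJson makes JSON payloads that differ only in key order or whitespace hash identically. A quick, self-contained illustration of the org.erdtman.jcs behavior the base class delegates to (the sample payloads are illustrative):

```java
import org.erdtman.jcs.JsonCanonicalizer;

public class CanonicalizationSketch {
    public static void main(String[] args) throws java.io.IOException {
        // Key order and whitespace differ, but the canonical encodings match, so the hashes would too.
        String a = new JsonCanonicalizer("{ \"b\": 1, \"a\": 2 }").getEncodedString();
        String b = new JsonCanonicalizer("{\"a\":2,\"b\":1}").getEncodedString();
        System.out.println(a.equals(b)); // true; both are {"a":2,"b":1}
    }
}
```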
...pi/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilterTest.java (new file, 48 additions, 0 deletions)

```java
/*
 * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
 */
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.StringHandle;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
 * Unit tests that make no connection to MarkLogic.
 */
class IncrementalWriteFilterTest {

    /**
     * Verifies that when a hash is added, a new metadata object is created so that a doc-specific hash field can be
     * added without affecting any other document that might be sharing the same metadata object.
     */
    @Test
    void addHashToMetadata() {
        DocumentMetadataHandle metadata = new DocumentMetadataHandle()
            .withCollections("c1")
            .withPermission("rest-reader", DocumentMetadataHandle.Capability.READ)
            .withQuality(2)
            .withProperty("prop1", "value1")
            .withMetadataValue("meta1", "value1");

        DocumentWriteOperation doc1 = new DocumentWriteOperationImpl("/1.xml", metadata, new StringHandle("<doc1/>"));
        DocumentWriteOperation doc2 = new DocumentWriteOperationImpl("/2.xml", metadata, new StringHandle("<doc2/>"));

        doc2 = IncrementalWriteFilter.addHashToMetadata(doc2, "theField", "abc123");

        assertEquals(metadata, doc1.getMetadata(), "doc1 should still have the original metadata object");

        DocumentMetadataHandle metadata2 = (DocumentMetadataHandle) doc2.getMetadata();
        assertEquals("c1", metadata2.getCollections().iterator().next(), "collection should be preserved");
        assertEquals(DocumentMetadataHandle.Capability.READ, metadata2.getPermissions().get("rest-reader").iterator().next(), "permission should be preserved");
        assertEquals(2, metadata2.getQuality(), "quality should be preserved");
        assertEquals("value1", metadata2.getProperties().get("prop1"), "property should be preserved");

        assertEquals("value1", metadata2.getMetadataValues().get("meta1"), "metadata value should be preserved");
        assertEquals("abc123", metadata2.getMetadataValues().get("theField"), "hash field should be added");
    }
}
```