5 changes: 5 additions & 0 deletions marklogic-client-api/build.gradle
@@ -37,6 +37,11 @@ dependencies {
implementation "com.fasterxml.jackson.core:jackson-databind:${jacksonVersion}"
implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:${jacksonVersion}"

// Dependencies for hash generation. These can be safely omitted if the incremental write feature is not used,
// but neither has any transitive dependencies, so their impact on the dependency tree is minimal.
implementation "io.github.erdtman:java-json-canonicalization:1.1"
implementation "net.openhft:zero-allocation-hashing:0.27ea1"

// Only used by extras (which some examples then depend on)
compileOnly 'org.jdom:jdom2:2.0.6.1'
compileOnly 'org.dom4j:dom4j:2.2.0'
57 changes: 57 additions & 0 deletions IncrementalWriteEvalFilter.java
@@ -0,0 +1,57 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.io.JacksonHandle;

import java.util.function.Consumer;

/**
* Uses server-side JavaScript code to get the existing hash values for a set of URIs.
*
* @since 8.1.0
*/
class IncrementalWriteEvalFilter extends IncrementalWriteFilter {

private static final String EVAL_SCRIPT = """
const tuples = cts.valueTuples([cts.uriReference(), cts.fieldReference(fieldName)], null, cts.documentQuery(uris));
const response = {};
for (var tuple of tuples) {
response[tuple[0]] = tuple[1];
}
response
""";

IncrementalWriteEvalFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}

@Override
public DocumentWriteSet apply(DocumentWriteSetFilter.Context context) {
ArrayNode uris = new ObjectMapper().createArrayNode();
for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
if (DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
uris.add(doc.getUri());
}
}

JsonNode response = context.getDatabaseClient().newServerEval().javascript(EVAL_SCRIPT)
.addVariable("fieldName", fieldName)
.addVariable("uris", new JacksonHandle(uris))
.evalAs(JsonNode.class);

return filterDocuments(context, uri -> {
if (response.has(uri)) {
return response.get(uri).asText();
}
return null;
});
}
}
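The lookup that the eval filter performs can also be run on its own, which is useful for inspecting the hashes currently stored for a set of URIs. The sketch below is illustrative rather than part of this changeset: the connection details are placeholders, and it assumes the target database defines a metadata field named "incrementalWriteHash" (the filter's default field name) so that cts.fieldReference resolves.

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.marklogic.client.DatabaseClient;
import com.marklogic.client.DatabaseClientFactory;
import com.marklogic.client.io.JacksonHandle;

public class HashLookupExample {

    public static void main(String[] args) {
        // Placeholder connection details; adjust host, port, and credentials for your environment.
        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8000,
            new DatabaseClientFactory.DigestAuthContext("example-user", "example-password"));

        ArrayNode uris = new ObjectMapper().createArrayNode();
        uris.add("/example/1.json");

        // Same server-side script the filter evaluates: returns an object mapping each URI to its stored hash.
        JsonNode response = client.newServerEval()
            .javascript("""
                const tuples = cts.valueTuples([cts.uriReference(), cts.fieldReference(fieldName)], null, cts.documentQuery(uris));
                const response = {};
                for (var tuple of tuples) {
                  response[tuple[0]] = tuple[1];
                }
                response
                """)
            .addVariable("fieldName", "incrementalWriteHash")
            .addVariable("uris", new JacksonHandle(uris))
            .evalAs(JsonNode.class);

        System.out.println(response);
        client.release();
    }
}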
187 changes: 187 additions & 0 deletions IncrementalWriteFilter.java
@@ -0,0 +1,187 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.impl.HandleAccessor;
import com.marklogic.client.io.BaseHandle;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.Format;
import net.openhft.hashing.LongHashFunction;
import org.erdtman.jcs.JsonCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;

/**
* A DocumentWriteSetFilter that skips writing documents whose content has not changed since the last write,
* based on a hash value stored in a MarkLogic field.
*
* @since 8.1.0
*/
public abstract class IncrementalWriteFilter implements DocumentWriteSetFilter {

protected final Logger logger = LoggerFactory.getLogger(this.getClass());

public static Builder newBuilder() {
return new Builder();
}

public static class Builder {

private String fieldName = "incrementalWriteHash";
private boolean canonicalizeJson = true;
private boolean useEvalQuery = false;
private Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

/**
* @param fieldName the name of the MarkLogic field that will hold the hash value; defaults to "incrementalWriteHash".
*/
public Builder fieldName(String fieldName) {
this.fieldName = fieldName;
return this;
}

/**
* @param canonicalizeJson whether to canonicalize JSON content before hashing; defaults to true.
* Delegates to https://github.com/erdtman/java-json-canonicalization for canonicalization.
*/
public Builder canonicalizeJson(boolean canonicalizeJson) {
this.canonicalizeJson = canonicalizeJson;
return this;
}

/**
* @param useEvalQuery if true, server-side JavaScript is evaluated instead of an Optic query to retrieve hash values; defaults to false.
*/
public Builder useEvalQuery(boolean useEvalQuery) {
this.useEvalQuery = useEvalQuery;
return this;
}

/**
* @param skippedDocumentsConsumer a consumer that will be called with any documents in a batch that were skipped because their content had not changed.
*/
public Builder onDocumentsSkipped(Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
return this;
}

public IncrementalWriteFilter build() {
if (useEvalQuery) {
return new IncrementalWriteEvalFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}
return new IncrementalWriteOpticFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}
}

protected final String fieldName;
private final boolean canonicalizeJson;
private final Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

// Hardcoding this for now, with a good general purpose hashing function.
// See https://xxhash.com for benchmarks.
private final LongHashFunction hashFunction = LongHashFunction.xx3();

public IncrementalWriteFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
this.fieldName = fieldName;
this.canonicalizeJson = canonicalizeJson;
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
}

protected final DocumentWriteSet filterDocuments(Context context, Function<String, String> hashRetriever) {
final DocumentWriteSet newWriteSet = context.getDatabaseClient().newDocumentManager().newWriteSet();
final List<DocumentWriteOperation> skippedDocuments = new ArrayList<>();

for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
if (!DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
newWriteSet.add(doc);
continue;
}

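// Hash the serialized (and, for JSON, possibly canonicalized) content so it can be compared to the stored hash.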
final String contentHash = computeHash(serializeContent(doc));
final String existingHash = hashRetriever.apply(doc.getUri());
if (logger.isTraceEnabled()) {
logger.trace("URI: {}, existing Hash: {}, new Hash: {}", doc.getUri(), existingHash, contentHash);
}

if (existingHash != null) {
if (!existingHash.equals(contentHash)) {
newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
} else if (skippedDocumentsConsumer != null) {
skippedDocuments.add(doc);
} else {
// No consumer, so skip the document silently.
}
} else {
newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
}
}

if (!skippedDocuments.isEmpty()) {
skippedDocumentsConsumer.accept(skippedDocuments.toArray(new DocumentWriteOperation[0]));
}

return newWriteSet;
}

private String serializeContent(DocumentWriteOperation doc) {
String content = HandleAccessor.contentAsString(doc.getContent());

Format format = null;
if (doc.getContent() instanceof BaseHandle<?, ?> baseHandle) {
format = baseHandle.getFormat();
}

if (canonicalizeJson && (Format.JSON.equals(format) || isPossiblyJsonContent(content))) {
JsonCanonicalizer jc;
try {
jc = new JsonCanonicalizer(content);
return jc.getEncodedString();
} catch (IOException e) {
// Going to improve this in the next PR, as I think we can throw an exception if Format = JSON.
logger.warn("Unable to canonicalize JSON content for URI {}, using original content for hashing; cause: {}",
doc.getUri(), e.getMessage());
}
}

return content;
}

private boolean isPossiblyJsonContent(String content) {
// This isn't 100% reliable, as the content could be text that just happens to start with { or [, and so
// we'll still need to catch an exception if we try to canonicalize non-JSON content.
String trimmed = content.trim();
return trimmed.startsWith("{") || trimmed.startsWith("[");
}

private String computeHash(String content) {
byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
long hash = hashFunction.hashBytes(bytes);
return Long.toHexString(hash);
}

protected static DocumentWriteOperation addHashToMetadata(DocumentWriteOperation op, String fieldName, String hash) {
DocumentMetadataHandle newMetadata = new DocumentMetadataHandle();
if (op.getMetadata() != null) {
DocumentMetadataHandle originalMetadata = (DocumentMetadataHandle) op.getMetadata();
newMetadata.setPermissions(originalMetadata.getPermissions());
newMetadata.setCollections(originalMetadata.getCollections());
newMetadata.setQuality(originalMetadata.getQuality());
newMetadata.setProperties(originalMetadata.getProperties());
newMetadata.getMetadataValues().putAll(originalMetadata.getMetadataValues());
}
newMetadata.getMetadataValues().put(fieldName, hash);
return new DocumentWriteOperationImpl(op.getUri(), newMetadata, op.getContent(), op.getTemporalDocumentURI());
}
}
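The Builder above is the public entry point for configuring the feature. Below is a minimal configuration sketch using only the options defined in this class; the field name is the documented default, the skipped-documents consumer is illustrative, and the final registration call is hypothetical, since this excerpt does not show how the filter is attached to a write pipeline such as a Data Movement SDK WriteBatcher.

import com.marklogic.client.datamovement.filter.IncrementalWriteFilter;

public class IncrementalWriteFilterUsageExample {

    public static void main(String[] args) {
        IncrementalWriteFilter filter = IncrementalWriteFilter.newBuilder()
            .fieldName("incrementalWriteHash") // default field name for the stored hash
            .canonicalizeJson(true)            // canonicalize JSON before hashing (default)
            .useEvalQuery(false)               // use the Optic implementation (default)
            .onDocumentsSkipped(skipped ->
                System.out.println("Skipped " + skipped.length + " unchanged documents"))
            .build();

        // Hypothetical registration call; how the filter is wired into a WriteBatcher is not shown in this excerpt.
        // writeBatcher.withDocumentWriteSetFilter(filter);
    }
}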
55 changes: 55 additions & 0 deletions IncrementalWriteOpticFilter.java
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.row.RowTemplate;

import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;

/**
* Uses an Optic query to get the existing hash values for a set of URIs.
*
* @since 8.1.0
*/
class IncrementalWriteOpticFilter extends IncrementalWriteFilter {

IncrementalWriteOpticFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}

@Override
public DocumentWriteSet apply(Context context) {
final String[] uris = context.getDocumentWriteSet().stream()
.filter(op -> DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType()))
.map(DocumentWriteOperation::getUri)
.toArray(String[]::new);

// It doesn't seem possible yet to use a DSL query and bind an array of strings to a "uris" param, so using
// a serialized query instead. That doesn't allow a user to override the query though.
Map<String, String> existingHashes = new RowTemplate(context.getDatabaseClient()).query(op ->
op.fromLexicons(Map.of(
"uri", op.cts.uriReference(),
"hash", op.cts.fieldReference(super.fieldName)
)).where(
op.cts.documentQuery(op.xs.stringSeq(uris))
),

rows -> {
Map<String, String> map = new HashMap<>();
rows.forEach(row -> {
String uri = row.getString("uri");
String existingHash = row.getString("hash");
map.put(uri, existingHash);
});
return map;
}
);

return filterDocuments(context, uri -> existingHashes.get(uri));
}
}
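IncrementalWriteOpticFilter delegates to a RowTemplate helper, but the same lexicon lookup can be expressed with the standard RowManager and PlanBuilder API. The sketch below is an illustrative equivalent rather than the filter's own code, and it assumes the database defines a metadata field for the given field name.

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.expression.PlanBuilder;
import com.marklogic.client.row.RowManager;

import java.util.HashMap;
import java.util.Map;

public class OpticHashLookupExample {

    // Returns a map of URI -> stored hash for the given URIs, mirroring the filter's Optic query.
    static Map<String, String> lookupHashes(DatabaseClient client, String fieldName, String... uris) {
        RowManager rowManager = client.newRowManager();
        PlanBuilder op = rowManager.newPlanBuilder();

        PlanBuilder.ModifyPlan plan = op.fromLexicons(Map.of(
                "uri", op.cts.uriReference(),
                "hash", op.cts.fieldReference(fieldName)))
            .where(op.cts.documentQuery(op.xs.stringSeq(uris)));

        Map<String, String> hashes = new HashMap<>();
        rowManager.resultRows(plan).forEach(row ->
            hashes.put(row.getString("uri"), row.getString("hash")));
        return hashes;
    }
}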
48 changes: 48 additions & 0 deletions IncrementalWriteFilterTest.java
@@ -0,0 +1,48 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.StringHandle;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;

/**
* Unit tests that make no connection to MarkLogic.
*/
class IncrementalWriteFilterTest {

/**
* Verifies that when a hash is added, a new metadata object is created so that a doc-specific hash field can be
* added without affecting any other document that might be sharing the same metadata object.
*/
@Test
void addHashToMetadata() {
DocumentMetadataHandle metadata = new DocumentMetadataHandle()
.withCollections("c1")
.withPermission("rest-reader", DocumentMetadataHandle.Capability.READ)
.withQuality(2)
.withProperty("prop1", "value1")
.withMetadataValue("meta1", "value1");

DocumentWriteOperation doc1 = new DocumentWriteOperationImpl("/1.xml", metadata, new StringHandle("<doc1/>"));
DocumentWriteOperation doc2 = new DocumentWriteOperationImpl("/2.xml", metadata, new StringHandle("<doc2/>"));

doc2 = IncrementalWriteFilter.addHashToMetadata(doc2, "theField", "abc123");

assertEquals(metadata, doc1.getMetadata(), "doc1 should still have the original metadata object");

DocumentMetadataHandle metadata2 = (DocumentMetadataHandle) doc2.getMetadata();
assertEquals("c1", metadata2.getCollections().iterator().next(), "collection should be preserved");
assertEquals(DocumentMetadataHandle.Capability.READ, metadata2.getPermissions().get("rest-reader").iterator().next(), "permission should be preserved");
assertEquals(2, metadata2.getQuality(), "quality should be preserved");
assertEquals("value1", metadata2.getProperties().get("prop1"), "property should be preserved");

assertEquals("value1", metadata2.getMetadataValues().get("meta1"), "metadata value should be preserved");
assertEquals("abc123", metadata2.getMetadataValues().get("theField"), "hash field should be added");
}
}