5 changes: 5 additions & 0 deletions marklogic-client-api/build.gradle
@@ -37,6 +37,11 @@ dependencies {
implementation "com.fasterxml.jackson.core:jackson-databind:${jacksonVersion}"
implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:${jacksonVersion}"

// Dependencies for hash generation. These can be safely omitted if the incremental write feature is not used.
// Neither has any transitive dependencies, so their impact on the dependency tree is minimal.
implementation "io.github.erdtman:java-json-canonicalization:1.1"
implementation "net.openhft:zero-allocation-hashing:0.27ea1"

// Only used by extras (which some examples then depend on)
compileOnly 'org.jdom:jdom2:2.0.6.1'
compileOnly 'org.dom4j:dom4j:2.2.0'
New file: com/marklogic/client/datamovement/DocumentWriteSetFilter.java
@@ -0,0 +1,39 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement;

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.document.DocumentWriteSet;

import java.util.function.Function;

/**
* A filter that can modify a DocumentWriteSet before it is written to the database.
*
* @since 8.1.0
*/
public interface DocumentWriteSetFilter extends Function<DocumentWriteSetFilter.Context, DocumentWriteSet> {

interface Context {
/**
* @return the DocumentWriteSet to be written
*/
DocumentWriteSet getDocumentWriteSet();

/**
* @return the batch number
*/
long getBatchNumber();

/**
* @return the DatabaseClient being used for this batch
*/
DatabaseClient getDatabaseClient();

/**
* @return the temporal collection name, or null if not writing to a temporal collection
*/
String getTemporalCollection();
}
}
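
As a usage illustration (not part of this change), here is a minimal sketch of a custom filter that drops documents under a hypothetical "/tmp/" URI prefix. It relies only on the Context methods above plus the standard DocumentManager.newWriteSet() factory; the class name and prefix are placeholders.

import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteSet;

public class SkipTempUrisFilter implements DocumentWriteSetFilter {

    @Override
    public DocumentWriteSet apply(Context context) {
        // Build a new write set containing every operation except those under "/tmp/".
        DocumentWriteSet filtered = context.getDatabaseClient().newDocumentManager().newWriteSet();
        context.getDocumentWriteSet().stream()
            .filter(op -> !op.getUri().startsWith("/tmp/"))
            .forEach(filtered::add);
        return filtered;
    }
}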
com/marklogic/client/datamovement/WriteBatcher.java
@@ -357,4 +357,17 @@ WriteBatcher addAs(String uri, DocumentMetadataWriteHandle metadataHandle,
* @param writeBatch the information about the batch that failed
*/
void retryWithFailureListeners(WriteBatch writeBatch);

/**
* Sets a filter to modify or replace the DocumentWriteSet before it is written.
* The filter can return either the modified DocumentWriteSet or a new one.
* If the filter returns null or an empty DocumentWriteSet, no write will occur.
*
* @param filter the function to apply before writing
* @return this instance for method chaining
* @since 8.1.0
*/
default WriteBatcher withDocumentWriteSetFilter(DocumentWriteSetFilter filter) {
return this;
}
}
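
For context, here is a sketch of wiring a filter into a typical WriteBatcher flow. It assumes an already-constructed DatabaseClient named client, placeholder batch settings and URIs, and a filter variable holding any DocumentWriteSetFilter (for example, the IncrementalWriteFilter added in this PR).

DataMovementManager dmm = client.newDataMovementManager();
WriteBatcher batcher = dmm.newWriteBatcher()
    .withBatchSize(100)
    .withThreadCount(4)
    .withDocumentWriteSetFilter(filter); // new in this PR
dmm.startJob(batcher);
batcher.add("/example/doc1.json", new StringHandle("{\"hello\":\"world\"}").withFormat(Format.JSON));
batcher.flushAndWait();
dmm.stopJob(batcher);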
New file: com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
@@ -0,0 +1,57 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.io.JacksonHandle;

import java.util.function.Consumer;

/**
* Uses server-side JavaScript code to get the existing hash values for a set of URIs.
*
* @since 8.1.0
*/
class IncrementalWriteEvalFilter extends IncrementalWriteFilter {

private static final String EVAL_SCRIPT = """
const tuples = cts.valueTuples([cts.uriReference(), cts.fieldReference(fieldName)], null, cts.documentQuery(uris));
const response = {};
for (var tuple of tuples) {
response[tuple[0]] = tuple[1];
}
response
""";

IncrementalWriteEvalFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}

@Override
public DocumentWriteSet apply(DocumentWriteSetFilter.Context context) {
ArrayNode uris = new ObjectMapper().createArrayNode();
context.getDocumentWriteSet().stream().forEach(op -> {
if (DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType())) {
uris.add(op.getUri());
}
});

JsonNode response = context.getDatabaseClient().newServerEval().javascript(EVAL_SCRIPT)
.addVariable("fieldName", fieldName)
.addVariable("uris", new JacksonHandle(uris))
.evalAs(JsonNode.class);

return filterDocuments(context, uri -> {
if (response.has(uri)) {
return response.get(uri).asText();
}
return null;
});
}
}
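
The eval above returns a JSON object keyed by URI, e.g. {"/doc1.json": "<hash>"}. Below is a sketch of opting into this eval-based lookup through the builder defined in IncrementalWriteFilter; as an assumption to verify for your environment, the /v1/eval endpoint generally requires eval privileges (for example, the rest-evaluator role) for the user the DatabaseClient is configured with.

DocumentWriteSetFilter filter = IncrementalWriteFilter.newBuilder()
    .useEvalQuery(true) // use the server-side JavaScript lookup instead of the default Optic query
    .build();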
New file: com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java
@@ -0,0 +1,178 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.impl.HandleAccessor;
import com.marklogic.client.io.BaseHandle;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.Format;
import com.marklogic.client.io.marker.AbstractWriteHandle;
import net.openhft.hashing.LongHashFunction;
import org.erdtman.jcs.JsonCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;

/**
* A DocumentWriteSetFilter that skips writing documents whose content has not changed since the last write,
* as determined by a hash value stored in a MarkLogic field.
*
* @since 8.1.0
*/
public abstract class IncrementalWriteFilter implements DocumentWriteSetFilter {

protected final Logger logger = LoggerFactory.getLogger(this.getClass());

public static Builder newBuilder() {
return new Builder();
}

public static class Builder {

private String fieldName = "incrementalWriteHash";
private boolean canonicalizeJson = true;
private boolean useEvalQuery = false;
private Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

/**
* @param fieldName the name of the MarkLogic field that will hold the hash value; defaults to "incrementalWriteHash".
*/
public Builder fieldName(String fieldName) {
this.fieldName = fieldName;
return this;
}

/**
* @param canonicalizeJson whether to canonicalize JSON content before hashing; defaults to true.
* Delegates to https://github.com/erdtman/java-json-canonicalization for canonicalization.
*/
public Builder canonicalizeJson(boolean canonicalizeJson) {
this.canonicalizeJson = canonicalizeJson;
return this;
}

/**
* @param useEvalQuery if true, server-side JavaScript is evaluated instead of an Optic query to retrieve existing hash values; defaults to false.
*/
public Builder useEvalQuery(boolean useEvalQuery) {
this.useEvalQuery = useEvalQuery;
return this;
}

/**
* @param skippedDocumentsConsumer a consumer that will be called with any documents in a batch that were skipped because their content had not changed.
*/
public Builder onDocumentsSkipped(Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
return this;
}

public IncrementalWriteFilter build() {
if (useEvalQuery) {
return new IncrementalWriteEvalFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}
return new IncrementalWriteOpticFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}
}

protected final String fieldName;
private final boolean canonicalizeJson;
private final Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

// Hardcoding this for now, with a good general purpose hashing function.
// See https://xxhash.com for benchmarks.
private final LongHashFunction hashFunction = LongHashFunction.xx3();

public IncrementalWriteFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
this.fieldName = fieldName;
this.canonicalizeJson = canonicalizeJson;
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
}

protected final DocumentWriteSet filterDocuments(Context context, Function<String, String> hashRetriever) {
final DocumentWriteSet newWriteSet = context.getDatabaseClient().newDocumentManager().newWriteSet();
final List<DocumentWriteOperation> skippedDocuments = new ArrayList<>();

for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
if (!DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
newWriteSet.add(doc);
[Review comment, Copilot AI, Dec 30, 2025: Logic error: non-DOCUMENT_WRITE operations are added to newWriteSet, but processing then continues for all documents, causing non-DOCUMENT_WRITE operations to be processed for hashing when they should be skipped. A continue statement is needed after newWriteSet.add(doc). Suggested change: add "continue;" immediately after "newWriteSet.add(doc);".]
continue;
}

final String content = serializeContent(doc.getContent());
final String contentHash = computeHash(content);
final String existingHash = hashRetriever.apply(doc.getUri());
if (logger.isTraceEnabled()) {
logger.trace("URI: {}, existing Hash: {}, new Hash: {}", doc.getUri(), existingHash, contentHash);
}

if (existingHash != null) {
if (!existingHash.equals(contentHash)) {
newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
} else if (skippedDocumentsConsumer != null) {
skippedDocuments.add(doc);
}
} else {
newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
}
}

if (!skippedDocuments.isEmpty()) {
skippedDocumentsConsumer.accept(skippedDocuments.toArray(new DocumentWriteOperation[0]));
}

return newWriteSet;
}

private String serializeContent(AbstractWriteHandle contentHandle) {
String content = HandleAccessor.contentAsString(contentHandle);

Format format = null;
if (contentHandle instanceof BaseHandle<?, ?> baseHandle) {
format = baseHandle.getFormat();
}

if (canonicalizeJson && (Format.JSON.equals(format) || content.startsWith("{"))) {
JsonCanonicalizer jc;
try {
jc = new JsonCanonicalizer(content);
} catch (IOException e) {
throw new RuntimeException("Unable to parse JSON content, cause: " + e.getMessage(), e);
}
return jc.getEncodedString();
}

return content;
}

private String computeHash(String content) {
byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
long hash = hashFunction.hashBytes(bytes);
return Long.toHexString(hash);
}

protected static DocumentWriteOperation addHashToMetadata(DocumentWriteOperation op, String fieldName, String hash) {
DocumentMetadataHandle newMetadata = new DocumentMetadataHandle();
if (op.getMetadata() != null) {
DocumentMetadataHandle originalMetadata = (DocumentMetadataHandle) op.getMetadata();
newMetadata.setPermissions(originalMetadata.getPermissions());
newMetadata.setCollections(originalMetadata.getCollections());
newMetadata.setQuality(originalMetadata.getQuality());
newMetadata.setProperties(originalMetadata.getProperties());
newMetadata.getMetadataValues().putAll(originalMetadata.getMetadataValues());
}
newMetadata.getMetadataValues().put(fieldName, hash);
return new DocumentWriteOperationImpl(op.getUri(), newMetadata, op.getContent(), op.getTemporalDocumentURI());
}
}
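
A sketch of building the filter with the options above; the field name and logging consumer are illustrative. As an assumption to verify against your database configuration, the cts.fieldReference lookup used by both subclasses presumably requires a MarkLogic field configured for the chosen name, since the hash is written as a metadata value under that key.

IncrementalWriteFilter filter = IncrementalWriteFilter.newBuilder()
    .fieldName("incrementalWriteHash") // the default, shown explicitly
    .canonicalizeJson(true)            // the default; canonicalize JSON content before hashing
    .onDocumentsSkipped(skipped ->
        System.out.println("Skipped " + skipped.length + " unchanged documents"))
    .build();
// The filter would then be registered via WriteBatcher.withDocumentWriteSetFilter(filter) as shown earlier.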
New file: com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.row.RowTemplate;

import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;

/**
* Uses an Optic query to get the existing hash values for a set of URIs.
*
* @since 8.1.0
*/
class IncrementalWriteOpticFilter extends IncrementalWriteFilter {

IncrementalWriteOpticFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}

@Override
public DocumentWriteSet apply(Context context) {
final String[] uris = context.getDocumentWriteSet().stream()
.filter(op -> DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType()))
.map(DocumentWriteOperation::getUri)
.toArray(String[]::new);

// It doesn't seem possible yet to use a DSL query and bind an array of strings to a "uris" param, so a
// serialized query is used instead. That does not allow a user to override the query, though.
Map<String, String> existingHashes = new RowTemplate(context.getDatabaseClient()).query(op ->
op.fromLexicons(Map.of(
"uri", op.cts.uriReference(),
"hash", op.cts.fieldReference(super.fieldName)
)).where(
op.cts.documentQuery(op.xs.stringSeq(uris))
),

rows -> {
Map<String, String> map = new HashMap<>();
rows.forEach(row -> {
String uri = row.getString("uri");
String existingHash = row.getString("hash");
map.put(uri, existingHash);
});
return map;
}
);

return filterDocuments(context, uri -> existingHashes.get(uri));
}
}
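
For a standalone way to inspect the stored hashes, here is a sketch that mirrors the lookup above using the public RowManager/PlanBuilder API rather than the RowTemplate helper; client is an existing DatabaseClient, the URIs are placeholders, and java.util.Map is assumed to be imported.

RowManager rowManager = client.newRowManager();
PlanBuilder p = rowManager.newPlanBuilder();
PlanBuilder.ModifyPlan plan = p
    .fromLexicons(Map.of(
        "uri", p.cts.uriReference(),
        "hash", p.cts.fieldReference("incrementalWriteHash")))
    .where(p.cts.documentQuery(p.xs.stringSeq("/example/doc1.json", "/example/doc2.json")));
rowManager.resultRows(plan).forEach(row ->
    System.out.println(row.getString("uri") + " -> " + row.getString("hash")));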