5 changes: 5 additions & 0 deletions marklogic-client-api/build.gradle
@@ -37,6 +37,11 @@ dependencies {
implementation "com.fasterxml.jackson.core:jackson-databind:${jacksonVersion}"
implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:${jacksonVersion}"

// Dependencies for hash generation. These can be safely omitted if the incremental write feature is not used.
// Neither has any transitive dependencies, so their impact on the dependency tree is minimal.
implementation "io.github.erdtman:java-json-canonicalization:1.1"
implementation "net.openhft:zero-allocation-hashing:0.27ea1"

// Only used by extras (which some examples then depend on)
compileOnly 'org.jdom:jdom2:2.0.6.1'
compileOnly 'org.dom4j:dom4j:2.2.0'
New file: com/marklogic/client/datamovement/DocumentWriteSetFilter.java
@@ -0,0 +1,39 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement;

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.document.DocumentWriteSet;

import java.util.function.Function;

/**
* A filter that can modify a DocumentWriteSet before it is written to the database.
*
* @since 8.1.0
*/
public interface DocumentWriteSetFilter extends Function<DocumentWriteSetFilter.Context, DocumentWriteSet> {

interface Context {
/**
* @return the DocumentWriteSet to be written
*/
DocumentWriteSet getDocumentWriteSet();

/**
* @return the batch number
*/
long getBatchNumber();

/**
* @return the DatabaseClient being used for this batch
*/
DatabaseClient getDatabaseClient();

/**
* @return the temporal collection name, or null if not writing to a temporal collection
*/
String getTemporalCollection();
}
}
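
As a usage illustration (not part of this change), here is a minimal sketch of a custom filter that drops documents under a hypothetical "/tmp/" URI prefix. It relies only on the Context methods above plus the standard DocumentManager.newWriteSet() factory; the class name and prefix are placeholders.

import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteSet;

public class SkipTempUrisFilter implements DocumentWriteSetFilter {

    @Override
    public DocumentWriteSet apply(Context context) {
        // Build a new write set containing every operation except those under "/tmp/".
        DocumentWriteSet filtered = context.getDatabaseClient().newDocumentManager().newWriteSet();
        context.getDocumentWriteSet().stream()
            .filter(op -> !op.getUri().startsWith("/tmp/"))
            .forEach(filtered::add);
        return filtered;
    }
}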
com/marklogic/client/datamovement/WriteBatcher.java
@@ -357,4 +357,17 @@ WriteBatcher addAs(String uri, DocumentMetadataWriteHandle metadataHandle,
* @param writeBatch the information about the batch that failed
*/
void retryWithFailureListeners(WriteBatch writeBatch);

/**
* Sets a filter to modify or replace the DocumentWriteSet before it is written.
* The filter can return either the modified DocumentWriteSet or a new one.
* If the filter returns null or an empty DocumentWriteSet, no write will occur.
*
* @param filter the function to apply before writing
* @return this instance for method chaining
* @since 8.1.0
*/
default WriteBatcher withDocumentWriteSetFilter(DocumentWriteSetFilter filter) {
return this;
}
}
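
For context, here is a sketch of wiring a filter into a typical WriteBatcher flow. It assumes an already-constructed DatabaseClient named client, placeholder batch settings and URIs, and a filter variable holding any DocumentWriteSetFilter (for example, the IncrementalWriteFilter added in this PR).

DataMovementManager dmm = client.newDataMovementManager();
WriteBatcher batcher = dmm.newWriteBatcher()
    .withBatchSize(100)
    .withThreadCount(4)
    .withDocumentWriteSetFilter(filter); // new in this PR
dmm.startJob(batcher);
batcher.add("/example/doc1.json", new StringHandle("{\"hello\":\"world\"}").withFormat(Format.JSON));
batcher.flushAndWait();
dmm.stopJob(batcher);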
New file: com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
@@ -0,0 +1,57 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.io.JacksonHandle;

import java.util.function.Consumer;

/**
* Uses server-side JavaScript code to get the existing hash values for a set of URIs.
*
* @since 8.1.0
*/
class IncrementalWriteEvalFilter extends IncrementalWriteFilter {

private static final String EVAL_SCRIPT = """
const tuples = cts.valueTuples([cts.uriReference(), cts.fieldReference(fieldName)], null, cts.documentQuery(uris));
const response = {};
for (var tuple of tuples) {
response[tuple[0]] = tuple[1];
}
response
""";

IncrementalWriteEvalFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}

@Override
public DocumentWriteSet apply(DocumentWriteSetFilter.Context context) {
ArrayNode uris = new ObjectMapper().createArrayNode();
context.getDocumentWriteSet().stream().forEach(op -> {
if (DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType())) {
uris.add(op.getUri());
}
});

JsonNode response = context.getDatabaseClient().newServerEval().javascript(EVAL_SCRIPT)
.addVariable("fieldName", fieldName)
.addVariable("uris", new JacksonHandle(uris))
.evalAs(JsonNode.class);

return filterDocuments(context, uri -> {
if (response.has(uri)) {
return response.get(uri).asText();
}
return null;
});
}
}
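
The eval above returns a JSON object keyed by URI, e.g. {"/doc1.json": "<hash>"}. Below is a sketch of opting into this eval-based lookup through the builder defined in IncrementalWriteFilter; as an assumption to verify for your environment, the /v1/eval endpoint generally requires eval privileges (for example, the rest-evaluator role) for the user the DatabaseClient is configured with.

DocumentWriteSetFilter filter = IncrementalWriteFilter.newBuilder()
    .useEvalQuery(true) // use the server-side JavaScript lookup instead of the default Optic query
    .build();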
New file: com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java
@@ -0,0 +1,178 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.datamovement.DocumentWriteSetFilter;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.impl.HandleAccessor;
import com.marklogic.client.io.BaseHandle;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.Format;
import com.marklogic.client.io.marker.AbstractWriteHandle;
import net.openhft.hashing.LongHashFunction;
import org.erdtman.jcs.JsonCanonicalizer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
import java.util.function.Function;

/**
* A DocumentWriteSetFilter that skips writing documents whose content has not changed since the last write,
* as determined by a hash value stored in a MarkLogic field.
*
* @since 8.1.0
*/
public abstract class IncrementalWriteFilter implements DocumentWriteSetFilter {

protected final Logger logger = LoggerFactory.getLogger(this.getClass());

public static Builder newBuilder() {
return new Builder();
}

public static class Builder {

private String fieldName = "incrementalWriteHash";
private boolean canonicalizeJson = true;
private boolean useEvalQuery = false;
private Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

/**
* @param fieldName the name of the MarkLogic field that will hold the hash value; defaults to "incrementalWriteHash".
*/
public Builder fieldName(String fieldName) {
this.fieldName = fieldName;
return this;
}

/**
* @param canonicalizeJson whether to canonicalize JSON content before hashing; defaults to true.
* Delegates to https://github.com/erdtman/java-json-canonicalization for canonicalization.
*/
public Builder canonicalizeJson(boolean canonicalizeJson) {
this.canonicalizeJson = canonicalizeJson;
return this;
}

/**
* @param useEvalQuery if true, server-side JavaScript is evaluated instead of an Optic query to retrieve existing hash values; defaults to false.
*/
public Builder useEvalQuery(boolean useEvalQuery) {
this.useEvalQuery = useEvalQuery;
return this;
}

/**
* @param skippedDocumentsConsumer a consumer that will be called with any documents in a batch that were skipped because their content had not changed.
*/
public Builder onDocumentsSkipped(Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
return this;
}

public IncrementalWriteFilter build() {
if (useEvalQuery) {
return new IncrementalWriteEvalFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}
return new IncrementalWriteOpticFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}
}

protected final String fieldName;
private final boolean canonicalizeJson;
private final Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;

// Hardcoding this for now, with a good general purpose hashing function.
// See https://xxhash.com for benchmarks.
private final LongHashFunction hashFunction = LongHashFunction.xx3();

public IncrementalWriteFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
this.fieldName = fieldName;
this.canonicalizeJson = canonicalizeJson;
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
}

protected final DocumentWriteSet filterDocuments(Context context, Function<String, String> hashRetriever) {
final DocumentWriteSet newWriteSet = context.getDatabaseClient().newDocumentManager().newWriteSet();
final List<DocumentWriteOperation> skippedDocuments = new ArrayList<>();

for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
if (!DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
newWriteSet.add(doc);
[Review comment, Copilot AI, Dec 30, 2025: Logic error: non-DOCUMENT_WRITE operations are added to newWriteSet, but processing then continues for all documents, causing non-DOCUMENT_WRITE operations to be processed for hashing when they should be skipped. A continue statement is needed after newWriteSet.add(doc). Suggested change: add "continue;" immediately after "newWriteSet.add(doc);".]
continue;
}

final String content = serializeContent(doc.getContent());
final String contentHash = computeHash(content);
final String existingHash = hashRetriever.apply(doc.getUri());
if (logger.isTraceEnabled()) {
logger.trace("URI: {}, existing Hash: {}, new Hash: {}", doc.getUri(), existingHash, contentHash);
}

if (existingHash != null) {
if (!existingHash.equals(contentHash)) {
newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
} else if (skippedDocumentsConsumer != null) {
skippedDocuments.add(doc);
}
} else {
newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
}
}

if (!skippedDocuments.isEmpty()) {
skippedDocumentsConsumer.accept(skippedDocuments.toArray(new DocumentWriteOperation[0]));
}

return newWriteSet;
}

private String serializeContent(AbstractWriteHandle contentHandle) {
String content = HandleAccessor.contentAsString(contentHandle);

Format format = null;
if (contentHandle instanceof BaseHandle<?, ?> baseHandle) {
format = baseHandle.getFormat();
}

if (canonicalizeJson && (Format.JSON.equals(format) || content.startsWith("{"))) {
JsonCanonicalizer jc;
try {
jc = new JsonCanonicalizer(content);
} catch (IOException e) {
throw new RuntimeException("Unable to parse JSON content, cause: " + e.getMessage(), e);
}
return jc.getEncodedString();
}

return content;
}

private String computeHash(String content) {
byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
long hash = hashFunction.hashBytes(bytes);
return Long.toHexString(hash);
}

protected static DocumentWriteOperation addHashToMetadata(DocumentWriteOperation op, String fieldName, String hash) {
DocumentMetadataHandle newMetadata = new DocumentMetadataHandle();
if (op.getMetadata() != null) {
DocumentMetadataHandle originalMetadata = (DocumentMetadataHandle) op.getMetadata();
newMetadata.setPermissions(originalMetadata.getPermissions());
newMetadata.setCollections(originalMetadata.getCollections());
newMetadata.setQuality(originalMetadata.getQuality());
newMetadata.setProperties(originalMetadata.getProperties());
newMetadata.getMetadataValues().putAll(originalMetadata.getMetadataValues());
}
newMetadata.getMetadataValues().put(fieldName, hash);
return new DocumentWriteOperationImpl(op.getUri(), newMetadata, op.getContent(), op.getTemporalDocumentURI());
}
}
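
A sketch of building the filter with the options above; the field name and logging consumer are illustrative. As an assumption to verify against your database configuration, the cts.fieldReference lookup used by both subclasses presumably requires a MarkLogic field configured for the chosen name, since the hash is written as a metadata value under that key.

IncrementalWriteFilter filter = IncrementalWriteFilter.newBuilder()
    .fieldName("incrementalWriteHash") // the default, shown explicitly
    .canonicalizeJson(true)            // the default; canonicalize JSON content before hashing
    .onDocumentsSkipped(skipped ->
        System.out.println("Skipped " + skipped.length + " unchanged documents"))
    .build();
// The filter would then be registered via WriteBatcher.withDocumentWriteSetFilter(filter) as shown earlier.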
New file: com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
*/
package com.marklogic.client.datamovement.filter;

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.document.DocumentWriteSet;
import com.marklogic.client.row.RowTemplate;

import java.util.HashMap;
import java.util.Map;
import java.util.function.Consumer;

/**
* Uses an Optic query to get the existing hash values for a set of URIs.
*
* @since 8.1.0
*/
class IncrementalWriteOpticFilter extends IncrementalWriteFilter {

IncrementalWriteOpticFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
}

@Override
public DocumentWriteSet apply(Context context) {
final String[] uris = context.getDocumentWriteSet().stream()
.filter(op -> DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType()))
.map(DocumentWriteOperation::getUri)
.toArray(String[]::new);

// It doesn't seem possible yet to use a DSL query and bind an array of strings to a "uris" param, so a
// serialized query is used instead. That does not allow a user to override the query, though.
Map<String, String> existingHashes = new RowTemplate(context.getDatabaseClient()).query(op ->
op.fromLexicons(Map.of(
"uri", op.cts.uriReference(),
"hash", op.cts.fieldReference(super.fieldName)
)).where(
op.cts.documentQuery(op.xs.stringSeq(uris))
),

rows -> {
Map<String, String> map = new HashMap<>();
rows.forEach(row -> {
String uri = row.getString("uri");
String existingHash = row.getString("hash");
map.put(uri, existingHash);
});
return map;
}
);

return filterDocuments(context, uri -> existingHashes.get(uri));
}
}
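
For a standalone way to inspect the stored hashes, here is a sketch that mirrors the lookup above using the public RowManager/PlanBuilder API rather than the RowTemplate helper; client is an existing DatabaseClient, the URIs are placeholders, and java.util.Map is assumed to be imported.

RowManager rowManager = client.newRowManager();
PlanBuilder p = rowManager.newPlanBuilder();
PlanBuilder.ModifyPlan plan = p
    .fromLexicons(Map.of(
        "uri", p.cts.uriReference(),
        "hash", p.cts.fieldReference("incrementalWriteHash")))
    .where(p.cts.documentQuery(p.xs.stringSeq("/example/doc1.json", "/example/doc2.json")));
rowManager.resultRows(plan).forEach(row ->
    System.out.println(row.getString("uri") + " -> " + row.getString("hash")));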