From 14acbf2969c100fe7a609e0c0b2a3ef7bf2ec0c3 Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Thu, 1 Jan 2026 13:13:33 +0800 Subject: [PATCH 1/2] DAOS-18367 vos: properly evict object for failed transaction Currently, if a transaction failed for some reason, the cleanup logic will try to evict related vos object from cache to avoid leaving stable information in cache. Such logic works well for the system with PMEM. But under md-on-ssd mode, the eviction may cause trouble. Because one vos modification may hold the same object multiple times, and there is CPU yield during these object hold actions. That creates race windows for other concurrent operations against the same object. This patch changes the logic: when the transaction changes some vos object(s), it will record related oid(s), if such transaction failed in subsequent process, it will only evict these modified objects. The others in cache will not be affected during transaction cleanup. On the other hand, under md-on-ssd mode, CPU may yield during backend TX start, the object that is held by current modification maybe marked as evicted in such race windows. So add logic to check whether related object is evicted or not after backend TX started, if yes, then restart current transaction. Allow-unstable-test: true Signed-off-by: Fan Yong --- src/dtx/dtx_common.c | 191 +++++---------------------------- src/dtx/tests/dts_structs.c | 9 +- src/include/daos/lru.h | 11 +- src/include/daos_srv/dtx_srv.h | 35 +++--- src/vos/tests/vts_dtx.c | 41 ++----- src/vos/vos_common.c | 32 ++++-- src/vos/vos_dtx.c | 137 +++++++++++++---------- src/vos/vos_internal.h | 29 ++--- src/vos/vos_io.c | 46 +++----- src/vos/vos_obj.c | 34 ++++-- src/vos/vos_obj.h | 7 ++ 11 files changed, 216 insertions(+), 356 deletions(-) diff --git a/src/dtx/dtx_common.c b/src/dtx/dtx_common.c index 175f440dc4e..ffb17279d34 100644 --- a/src/dtx/dtx_common.c +++ b/src/dtx/dtx_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -883,15 +883,10 @@ dtx_handle_reinit(struct dtx_handle *dth) dth->dth_modify_shared = 0; dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; dth->dth_local_tx_started = 0; dth->dth_cos_done = 0; - - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - D_FREE(dth->dth_oid_array); - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_dkey_hash = 0; vos_dtx_rsrvd_fini(dth); return vos_dtx_rsrvd_init(dth); @@ -926,32 +921,29 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_coh = xoh; } - dth->dth_ver = pm_ver; - dth->dth_refs = 1; - dth->dth_mbs = mbs; - - dth->dth_pinned = 0; - dth->dth_cos_done = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; - dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; - dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; - dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; + dth->dth_ver = pm_ver; + dth->dth_refs = 1; + dth->dth_mbs = mbs; + dth->dth_pinned = 0; + dth->dth_cos_done = 0; + dth->dth_modify_shared = 0; + dth->dth_active = 0; + dth->dth_local_tx_started = 0; + dth->dth_solo = (flags & DTX_SOLO) ? 1 : 0; + dth->dth_drop_cmt = (flags & DTX_DROP_CMT) ? 1 : 0; + dth->dth_dist = (flags & DTX_DIST) ? 1 : 0; + dth->dth_for_migration = (flags & DTX_FOR_MIGRATION) ? 1 : 0; dth->dth_ignore_uncommitted = (flags & DTX_IGNORE_UNCOMMITTED) ? 1 : 0; - dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; - dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; + dth->dth_prepared = (flags & DTX_PREPARED) ? 1 : 0; + dth->dth_epoch_owner = (flags & DTX_EPOCH_OWNER) ? 1 : 0; + dth->dth_aborted = 0; + dth->dth_already = 0; + dth->dth_need_validation = 0; dth->dth_local = (flags & DTX_LOCAL) ? 1 : 0; - - dth->dth_dti_cos = dti_cos; - dth->dth_dti_cos_count = dti_cos_cnt; - dth->dth_ent = NULL; - dth->dth_flags = leader ? DTE_LEADER : 0; + dth->dth_dti_cos = dti_cos; + dth->dth_dti_cos_count = dti_cos_cnt; + dth->dth_ent = NULL; + dth->dth_flags = leader ? DTE_LEADER : 0; if (flags & DTX_SYNC) { dth->dth_flags |= DTE_BLOCK; @@ -960,12 +952,11 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, dth->dth_sync = 0; } - dth->dth_op_seq = 0; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = 0; + dth->dth_op_seq = 0; + dth->dth_local_oid_cnt = 0; + dth->dth_local_oid_cap = 0; + dth->dth_local_oid_array = NULL; + dth->dth_dkey_hash = 0; if (!(flags & DTX_LOCAL)) { if (daos_is_zero_dti(dti)) @@ -1001,83 +992,6 @@ dtx_handle_init(struct dtx_id *dti, daos_handle_t xoh, struct dtx_epoch *epoch, return rc; } -static int -dtx_insert_oid(struct dtx_handle *dth, daos_unit_oid_t *oid, bool touch_leader) -{ - int start = 0; - int end = dth->dth_oid_cnt - 1; - int at; - int rc = 0; - - do { - at = (start + end) / 2; - rc = daos_unit_oid_compare(dth->dth_oid_array[at], *oid); - if (rc == 0) - return 0; - - if (rc > 0) - end = at - 1; - else - start = at + 1; - } while (start <= end); - - if (dth->dth_oid_cnt == dth->dth_oid_cap) { - daos_unit_oid_t *oid_array; - - D_ALLOC_ARRAY(oid_array, dth->dth_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - if (at > 0) - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * at); - oid_array[at] = *oid; - memcpy(&oid_array[at + 1], &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - } else { - /* Insert after dth->dth_oid_array[at]. */ - memcpy(&oid_array[0], &dth->dth_oid_array[0], - sizeof(*oid) * (at + 1)); - oid_array[at + 1] = *oid; - if (at < dth->dth_oid_cnt - 1) - memcpy(&oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - } - - D_FREE(dth->dth_oid_array); - dth->dth_oid_array = oid_array; - dth->dth_oid_cap <<= 1; - - goto out; - } - - if (rc > 0) { - /* Insert before dth->dth_oid_array[at]. */ - memmove(&dth->dth_oid_array[at + 1], - &dth->dth_oid_array[at], - sizeof(*oid) * (dth->dth_oid_cnt - at)); - dth->dth_oid_array[at] = *oid; - } else { - /* Insert after dth->dth_oid_array[at]. */ - if (at < dth->dth_oid_cnt - 1) - memmove(&dth->dth_oid_array[at + 2], - &dth->dth_oid_array[at + 1], - sizeof(*oid) * (dth->dth_oid_cnt - 1 - at)); - dth->dth_oid_array[at + 1] = *oid; - } - -out: - if (touch_leader) - dth->dth_touched_leader_oid = 1; - - dth->dth_oid_cnt++; - - return 0; -} - void dtx_renew_epoch(struct dtx_epoch *epoch, struct dtx_handle *dth) { @@ -1110,51 +1024,6 @@ dtx_sub_init(struct dtx_handle *dth, daos_unit_oid_t *oid, uint64_t dkey_hash) dth->dth_dkey_hash = dkey_hash; dth->dth_op_seq++; - rc = daos_unit_oid_compare(dth->dth_leader_oid, *oid); - if (rc == 0) { - if (dth->dth_oid_array == NULL) - dth->dth_touched_leader_oid = 1; - - if (dth->dth_touched_leader_oid) - goto out; - - rc = dtx_insert_oid(dth, oid, true); - - D_GOTO(out, rc); - } - - if (dth->dth_oid_array == NULL) { - D_ASSERT(dth->dth_oid_cnt == 0); - - /* 4 slots by default to hold rename case. */ - dth->dth_oid_cap = 4; - D_ALLOC_ARRAY(dth->dth_oid_array, dth->dth_oid_cap); - if (dth->dth_oid_array == NULL) - D_GOTO(out, rc = -DER_NOMEM); - - if (!dth->dth_touched_leader_oid) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_cnt = 1; - - D_GOTO(out, rc = 0); - } - - dth->dth_oid_cnt = 2; - - if (rc > 0) { - dth->dth_oid_array[0] = *oid; - dth->dth_oid_array[1] = dth->dth_leader_oid; - } else { - dth->dth_oid_array[0] = dth->dth_leader_oid; - dth->dth_oid_array[1] = *oid; - } - - D_GOTO(out, rc = 0); - } - - rc = dtx_insert_oid(dth, oid, false); - -out: D_DEBUG(DB_IO, "Sub init DTX "DF_DTI" for object "DF_UOID " dkey %lu, opc seq %d: "DF_RC"\n", DP_DTI(&dth->dth_xid), DP_UOID(*oid), @@ -1493,7 +1362,6 @@ dtx_leader_end(struct dtx_leader_handle *dlh, struct ds_cont_child *cont, int re dth->dth_sync ? "sync" : "async", dth->dth_dti_cos_count, dth->dth_cos_done ? dth->dth_dti_cos_count : 0, DP_RC(result)); - D_FREE(dth->dth_oid_array); D_FREE(dlh); d_tm_dec_gauge(dtx_tls_get()->dt_dtx_leader_total, 1); @@ -1617,7 +1485,6 @@ dtx_end(struct dtx_handle *dth, struct ds_cont_child *cont, int result) vos_dtx_detach(dth); out: - D_FREE(dth->dth_oid_array); D_FREE(dth); return result; diff --git a/src/dtx/tests/dts_structs.c b/src/dtx/tests/dts_structs.c index bddfdf9816c..0e656e33f2c 100644 --- a/src/dtx/tests/dts_structs.c +++ b/src/dtx/tests/dts_structs.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP. * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -62,7 +62,6 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_drop_cmt); SET_BITFIELD_1(dummy, dth_modify_shared); SET_BITFIELD_1(dummy, dth_active); - SET_BITFIELD_1(dummy, dth_touched_leader_oid); SET_BITFIELD_1(dummy, dth_local_tx_started); SET_BITFIELD_1(dummy, dth_shares_inited); SET_BITFIELD_1(dummy, dth_dist); @@ -75,7 +74,7 @@ struct_dtx_handle(void **state) SET_BITFIELD_1(dummy, dth_local); SET_BITFIELD_1(dummy, dth_epoch_owner); SET_BITFIELD_1(dummy, dth_local_complete); - SET_BITFIELD(dummy, padding1, 12); + SET_BITFIELD(dummy, padding1, 13); SET_FIELD(dummy, dth_dti_cos_count); SET_FIELD(dummy, dth_dti_cos); @@ -87,10 +86,6 @@ struct_dtx_handle(void **state) SET_FIELD(dummy, dth_op_seq); SET_FIELD(dummy, dth_deferred_used_cnt); SET_FIELD(dummy, padding2); - SET_FIELD(dummy, dth_oid_cnt); - SET_FIELD(dummy, dth_oid_cap); - SET_FIELD(dummy, padding3); - SET_FIELD(dummy, dth_oid_array); SET_FIELD(dummy, dth_local_oid_cnt); SET_FIELD(dummy, dth_local_oid_cap); SET_FIELD(dummy, padding4); diff --git a/src/include/daos/lru.h b/src/include/daos/lru.h index de6c5a373b9..6b21d31a6f3 100644 --- a/src/include/daos/lru.h +++ b/src/include/daos/lru.h @@ -1,6 +1,6 @@ /* * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -137,6 +137,15 @@ daos_lru_ref_evict(struct daos_lru_cache *lcache, struct daos_llink *llink) d_hash_rec_evict_at(&lcache->dlc_htable, &llink->ll_link); } +/** + * Whether the item is evicted or not. + */ +static inline bool +daos_lru_is_evicted(struct daos_llink *llink) +{ + return llink->ll_evicted != 0; +} + /** * Evict the item from LRU before releasing the refcount on it, wait until * the caller is the last one holds refcount. diff --git a/src/include/daos_srv/dtx_srv.h b/src/include/daos_srv/dtx_srv.h index 6143ed9b350..873d59ef1b2 100644 --- a/src/include/daos_srv/dtx_srv.h +++ b/src/include/daos_srv/dtx_srv.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -47,6 +47,7 @@ struct dtx_local_oid_record { * the most optimal way (packed). Please make sure that all necessary padding * is explicit so it could be used in the future. */ +/* clang-format off */ struct dtx_handle { union { struct dtx_entry dth_dte; @@ -92,8 +93,6 @@ struct dtx_handle { dth_modify_shared : 1, /* The DTX entry is in active table. */ dth_active : 1, - /* Leader oid is touched. */ - dth_touched_leader_oid : 1, /* Local TX is started. */ dth_local_tx_started : 1, /* The DTX share lists are inited. */ @@ -117,7 +116,7 @@ struct dtx_handle { /* Locally generate the epoch. */ dth_epoch_owner : 1, /* Flag to commit the local transaction */ - dth_local_complete : 1, padding1 : 12; + dth_local_complete : 1, padding1 : 13; /* The count the DTXs in the dth_dti_cos array. */ uint32_t dth_dti_cos_count; @@ -138,25 +137,14 @@ struct dtx_handle { uint16_t dth_deferred_used_cnt; uint16_t padding2; - union { - struct { - /** The count of objects that are modified by this DTX. */ - uint16_t dth_oid_cnt; - /** The total slots in the dth_oid_array. */ - uint16_t dth_oid_cap; - uint32_t padding3; - /** If more than one objects are modified, the IDs are reocrded here. */ - daos_unit_oid_t *dth_oid_array; - }; - struct { - /** The count of objects stored in dth_local_oid_array. */ - uint16_t dth_local_oid_cnt; - /** The total slots in the dth_local_oid_array. */ - uint16_t dth_local_oid_cap; - uint32_t padding4; - /** The record of all objects touched by the local transaction. */ - struct dtx_local_oid_record *dth_local_oid_array; - }; + struct { + /** The count of objects stored in dth_local_oid_array. */ + uint16_t dth_local_oid_cnt; + /** The total slots in the dth_local_oid_array. */ + uint16_t dth_local_oid_cap; + uint32_t padding4; + /** The record of all objects touched by the local transaction. */ + struct dtx_local_oid_record *dth_local_oid_array; }; /* Hash of the dkey to be modified if applicable. Per modification. */ @@ -179,6 +167,7 @@ struct dtx_handle { int dth_share_tbd_count; uint32_t padding5; }; +/* clang-format on */ /* Each sub transaction handle to manage each sub thandle */ struct dtx_sub_status { diff --git a/src/vos/tests/vts_dtx.c b/src/vos/tests/vts_dtx.c index 12cd6d72728..57d80412c96 100644 --- a/src/vos/tests/vts_dtx.c +++ b/src/vos/tests/vts_dtx.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -51,40 +51,13 @@ vts_dtx_begin(const daos_unit_oid_t *oid, daos_handle_t coh, daos_epoch_t epoch, vts_init_dte(&dth->dth_dte); - dth->dth_coh = coh; - dth->dth_epoch = epoch; - dth->dth_leader_oid = *oid; - - dth->dth_pinned = 0; - dth->dth_sync = 0; - dth->dth_cos_done = 0; - dth->dth_touched_leader_oid = 0; - dth->dth_local_tx_started = 0; - dth->dth_solo = 0; - dth->dth_drop_cmt = 0; - dth->dth_modify_shared = 0; - dth->dth_active = 0; - dth->dth_dist = 0; - dth->dth_for_migration = 0; - dth->dth_ignore_uncommitted = 0; - dth->dth_prepared = 0; - dth->dth_epoch_owner = 0; - dth->dth_aborted = 0; - dth->dth_already = 0; - dth->dth_need_validation = 0; - - dth->dth_dti_cos_count = 0; - dth->dth_dti_cos = NULL; - dth->dth_ent = NULL; - dth->dth_flags = DTE_LEADER; + dth->dth_coh = coh; + dth->dth_epoch = epoch; + dth->dth_leader_oid = *oid; + dth->dth_flags = DTE_LEADER; dth->dth_modification_cnt = 1; - - dth->dth_op_seq = 1; - dth->dth_oid_cnt = 0; - dth->dth_oid_cap = 0; - dth->dth_oid_array = NULL; - - dth->dth_dkey_hash = dkey_hash; + dth->dth_op_seq = 1; + dth->dth_dkey_hash = dkey_hash; D_INIT_LIST_HEAD(&dth->dth_share_cmt_list); D_INIT_LIST_HEAD(&dth->dth_share_abt_list); diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index a7397a94256..0a154bf5cad 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -215,12 +215,22 @@ vos_tx_publish(struct dtx_handle *dth, bool publish) } int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj) { int rc; - if (dth == NULL) - return umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (dth == NULL) { + /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ + rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); + if (rc == 0 && unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(1), need to restart TX.\n", + DP_UOID(obj->obj_id)); + rc = umem_tx_end(umm, -DER_TX_RESTART); + } + + return rc; + } D_ASSERT(!is_sysdb); /** Note: On successful return, dth tls gets set and will be cleared by the corresponding @@ -235,6 +245,14 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb) rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); if (rc == 0) { + /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ + if (unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(2), need to restart TX.\n", + DP_UOID(obj->obj_id)); + + return umem_tx_end(umm, -DER_TX_RESTART); + } + dth->dth_local_tx_started = 1; vos_dth_set(dth, false); } @@ -250,12 +268,6 @@ vos_local_tx_abort(struct dtx_handle *dth) if (dth->dth_local_oid_cnt == 0) return; - /** - * Since a local transaction spawns always a single pool an eaither one of the containers - * can be used to access the pool. - */ - record = &dth->dth_local_oid_array[0]; - /** * Evict all objects touched by the aborted transaction from the object cache to make sure * no invalid pointer stays there. Not all of the touched objects have to be evicted but diff --git a/src/vos/vos_dtx.c b/src/vos/vos_dtx.c index c64adc12bb0..9eff6c89dd1 100644 --- a/src/vos/vos_dtx.c +++ b/src/vos/vos_dtx.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2019-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -151,37 +151,21 @@ dtx_inprogress(struct vos_dtx_act_ent *dae, struct dtx_handle *dth, } static void -dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, - struct dtx_handle *dth, bool evict, bool keep_df) +dtx_act_ent_cleanup(struct vos_container *cont, struct vos_dtx_act_ent *dae, bool evict, + bool keep_df) { - if (evict) { - daos_unit_oid_t *oids; - int count; - int i; + if (evict && dae->dae_oids != NULL) { + int i; - if (dth != NULL) { - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - count = dth->dth_oid_cnt; - oids = dth->dth_oid_array; - } else { - count = 1; - oids = &dth->dth_leader_oid; - } - } else { - count = dae->dae_oid_cnt; - oids = dae->dae_oids; - } - - for (i = 0; i < count; i++) - vos_obj_evict_by_oid(cont, oids[i]); + for (i = 0; i < dae->dae_oid_cnt; i++) + vos_obj_evict_by_oid(cont, dae->dae_oids[i]); } if (dae->dae_oids != NULL && dae->dae_oids != &dae->dae_oid_inline && dae->dae_oids != &DAE_OID(dae)) { D_FREE(dae->dae_oids); dae->dae_oid_cnt = 0; + dae->dae_oid_cap = 0; } DAE_REC_OFF(dae) = UMOFF_NULL; @@ -254,7 +238,7 @@ dtx_act_ent_free(struct btr_instance *tins, struct btr_record *rec, D_ASSERT(dae != NULL); *(struct vos_dtx_act_ent **)args = dae; } else if (dae != NULL) { - dtx_act_ent_cleanup(tins->ti_priv, dae, NULL, true, false); + dtx_act_ent_cleanup(tins->ti_priv, dae, true, false); } return 0; @@ -879,7 +863,7 @@ vos_dtx_commit_one(struct vos_container *cont, struct dtx_id *dti, daos_epoch_t rc = dbtree_delete(cont->vc_dtx_active_hdl, BTR_PROBE_BYPASS, &kiov, &dae); if (rc == 0) { - dtx_act_ent_cleanup(cont, dae, NULL, false, false); + dtx_act_ent_cleanup(cont, dae, false, false); dtx_evict_lid(cont, dae); } @@ -1845,30 +1829,6 @@ vos_dtx_prepared(struct dtx_handle *dth, struct vos_dtx_cmt_ent **dce_p) (dth->dth_modification_cnt > 0)) dth->dth_sync = 1; - if (dth->dth_oid_array != NULL) { - D_ASSERT(dth->dth_oid_cnt > 0); - - dae->dae_oid_cnt = dth->dth_oid_cnt; - if (dth->dth_oid_cnt == 1) { - dae->dae_oid_inline = dth->dth_oid_array[0]; - dae->dae_oids = &dae->dae_oid_inline; - } else { - size = sizeof(daos_unit_oid_t) * dth->dth_oid_cnt; - D_ALLOC_NZ(dae->dae_oids, size); - if (dae->dae_oids == NULL) { - /* Not fatal. */ - D_WARN("No DRAM to store ACT DTX OIDs " - DF_DTI"\n", DP_DTI(&DAE_XID(dae))); - dae->dae_oid_cnt = 0; - } else { - memcpy(dae->dae_oids, dth->dth_oid_array, size); - } - } - } else { - dae->dae_oids = &DAE_OID(dae); - dae->dae_oid_cnt = 1; - } - if (DAE_MBS_DSIZE(dae) <= sizeof(DAE_MBS_INLINE(dae))) { memcpy(DAE_MBS_INLINE(dae), dth->dth_mbs->dm_data, DAE_MBS_DSIZE(dae)); @@ -2441,7 +2401,7 @@ vos_dtx_post_handle(struct vos_container *cont, DAE_FLAGS(daes[i]) |= DTE_PARTIAL_COMMITTED; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, true); + dtx_act_ent_cleanup(cont, daes[i], false, true); continue; } @@ -2467,13 +2427,13 @@ vos_dtx_post_handle(struct vos_container *cont, daes[i]->dae_aborted = 1; daes[i]->dae_aborting = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, true, false); + dtx_act_ent_cleanup(cont, daes[i], true, false); } else { D_ASSERT(daes[i]->dae_aborting == 0); daes[i]->dae_committed = 1; daes[i]->dae_committing = 0; - dtx_act_ent_cleanup(cont, daes[i], NULL, false, false); + dtx_act_ent_cleanup(cont, daes[i], false, false); } DAE_FLAGS(daes[i]) &= ~(DTE_CORRUPTED | DTE_ORPHAN | DTE_PARTIAL_COMMITTED); } @@ -3659,7 +3619,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) */ if (dae != NULL) { D_ASSERT(!vos_dae_is_prepare(dae)); - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); } } else { d_iov_set(&kiov, &dth->dth_xid, sizeof(dth->dth_xid)); @@ -3682,7 +3642,7 @@ vos_dtx_cleanup_internal(struct dtx_handle *dth) if (DAE_EPOCH(dae) != dth->dth_epoch) goto out; - dtx_act_ent_cleanup(cont, dae, dth, true, false); + dtx_act_ent_cleanup(cont, dae, true, false); rc = dbtree_delete(cont->vc_dtx_active_hdl, riov.iov_buf != NULL ? BTR_PROBE_BYPASS : BTR_PROBE_EQ, @@ -4040,7 +4000,7 @@ vos_dtx_local_begin(struct dtx_handle *dth, daos_handle_t poh) goto error; } - rc = vos_tx_begin(dth, umm, pool->vp_sysdb); + rc = vos_tx_begin(dth, umm, pool->vp_sysdb, NULL); if (rc != 0) { D_ERROR("Failed to start transaction: rc=" DF_RC "\n", DP_RC(rc)); goto error; @@ -4167,3 +4127,68 @@ vos_dtx_get_cmt_stat(daos_handle_t coh, uint64_t *cmt_cnt, struct dtx_time_stat out: return rc; } + +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid) +{ + struct dtx_local_oid_record *oid_array; + struct dtx_local_oid_record *record; + struct vos_dtx_act_ent *dae; + daos_unit_oid_t *oids; + int rc = 0; + + if (dth == NULL) + D_GOTO(out, rc = 0); + + if (dth->dth_local) { + if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { + D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, + dth->dth_local_oid_cap << 1); + if (oid_array == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + dth->dth_local_oid_array = oid_array; + dth->dth_local_oid_cap <<= 1; + } + + record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; + record->dor_cont = cont; + vos_cont_addref(cont); + record->dor_oid = oid; + dth->dth_local_oid_cnt++; + + D_GOTO(out, rc = 0); + } + + if (daos_is_zero_dti(&dth->dth_xid)) + D_GOTO(out, rc = 0); + + dae = dth->dth_ent; + D_ASSERT(dae != NULL); + + if (dae->dae_oid_cnt == 0) { + if (daos_unit_oid_compare(oid, DAE_OID(dae)) == 0) + dae->dae_oids = &DAE_OID(dae); + else + dae->dae_oids = &dae->dae_oid_inline; + } else if (dae->dae_oid_cnt >= dae->dae_oid_cap) { + D_ALLOC_ARRAY(oids, dae->dae_oid_cnt << 1); + if (oids == NULL) + D_GOTO(out, rc = -DER_NOMEM); + + memcpy(oids, dae->dae_oids, sizeof(*oids) * dae->dae_oid_cnt); + if (dae->dae_oids != &DAE_OID(dae) && dae->dae_oids != &dae->dae_oid_inline) + D_FREE(dae->dae_oids); + + dae->dae_oids = oids; + dae->dae_oid_cap = dae->dae_oid_cnt << 1; + } + + dae->dae_oids[dae->dae_oid_cnt++] = oid; + +out: + if (rc != 0) + D_ERROR("Failed to record oid " DF_UOID ": " DF_RC "\n", DP_UOID(oid), DP_RC(rc)); + + return rc; +} diff --git a/src/vos/vos_internal.h b/src/vos/vos_internal.h index 18e6438ce6e..428051203ed 100644 --- a/src/vos/vos_internal.h +++ b/src/vos/vos_internal.h @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * (C) Copyright 2025 Google LLC * * SPDX-License-Identifier: BSD-2-Clause-Patent @@ -470,10 +470,6 @@ struct vos_dtx_act_ent { * then 'dae_oids' points to the 'dae_oid_inline'. * * Otherwise, 'dae_oids' points to new buffer to hold more. - * - * These information is used for EC aggregation optimization. - * If server restarts, then we will lose the optimization but - * it is not fatal. */ daos_unit_oid_t *dae_oids; /* The time (hlc) when the DTX entry is created. */ @@ -485,6 +481,9 @@ struct vos_dtx_act_ent { /* Back pointer to the DTX handle. */ struct dtx_handle *dae_dth; + /* The capacity of dae_oids if it points to new allocated area. */ + uint32_t dae_oid_cap; + unsigned int dae_committable:1, dae_committing:1, dae_committed:1, @@ -855,6 +854,9 @@ vos_dtx_post_handle(struct vos_container *cont, int vos_dtx_act_reindex(struct vos_container *cont); +int +vos_dtx_record_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t oid); + enum vos_tree_class { /** the first reserved tree class */ VOS_BTR_BEGIN = DBTREE_VOS_BEGIN, @@ -1336,7 +1338,8 @@ vos_evt_desc_cbs_init(struct evt_desc_cbs *cbs, struct vos_pool *pool, daos_handle_t coh, struct vos_object *obj); int -vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb); +vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, + struct vos_object *obj); /** Finish the transaction and publish or cancel the reservations or * return if err == 0 and it's a multi-modification transaction that @@ -1928,20 +1931,6 @@ vos_io_scm(struct vos_pool *pool, daos_iod_type_t type, daos_size_t size, enum v return false; } -/** - * Insert object ID and its parent container into the array of objects touched by the ongoing - * local transaction. - * - * \param[in] dth DTX handle for ongoing local transaction - * \param[in] cont VOS container - * \param[in] oid Object ID - * - * \return 0 : Success. - * -DER_NOMEM : Run out of the volatile memory. - */ -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid); - static inline bool vos_pool_is_p2(struct vos_pool *pool) { diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index cebf9181aaa..afaf48cb320 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2018-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -2552,32 +2552,6 @@ update_cancel(struct vos_io_context *ioc) true /* abort */); } -int -vos_insert_oid(struct dtx_handle *dth, struct vos_container *cont, daos_unit_oid_t *oid) -{ - struct dtx_local_oid_record *oid_array = NULL; - struct dtx_local_oid_record *record = NULL; - - /** The array has to grow to accommodate the next record. */ - if (dth->dth_local_oid_cnt == dth->dth_local_oid_cap) { - D_REALLOC_ARRAY(oid_array, dth->dth_local_oid_array, dth->dth_local_oid_cap, - dth->dth_local_oid_cap << 1); - if (oid_array == NULL) - return -DER_NOMEM; - - dth->dth_local_oid_array = oid_array; - dth->dth_local_oid_cap <<= 1; - } - - record = &dth->dth_local_oid_array[dth->dth_local_oid_cnt]; - record->dor_cont = cont; - vos_cont_addref(cont); - record->dor_oid = *oid; - dth->dth_local_oid_cnt++; - - return 0; -} - int vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, daos_size_t *size, struct dtx_handle *dth) @@ -2598,6 +2572,13 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; + if (unlikely(vos_obj_is_evicted(ioc->ic_pinned_obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted during update, need to restart TX.\n", + DP_UOID(ioc->ic_oid)); + + D_GOTO(abort, err = -DER_TX_RESTART); + } + err = vos_ts_set_add(ioc->ic_ts_set, ioc->ic_cont->vc_ts_idx, NULL, 0); D_ASSERT(err == 0); @@ -2606,7 +2587,10 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; - err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb); + if (ioc->ic_pinned_obj != NULL) + D_ASSERT(ioc->ic_pinned_obj == ioc->ic_obj); + + err = vos_tx_begin(dth, umem, ioc->ic_cont->vc_pool->vp_sysdb, ioc->ic_obj); if (err != 0) goto abort; @@ -2663,9 +2647,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, goto abort; } - if (dtx_is_valid_handle(dth) && dth->dth_local) { - err = vos_insert_oid(dth, ioc->ic_cont, &ioc->ic_oid); - } + err = vos_dtx_record_oid(dth, ioc->ic_cont, ioc->ic_oid); abort: if (err == -DER_NONEXIST || err == -DER_EXIST || @@ -2727,7 +2709,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, *size = ioc->ic_io_size; D_FREE(daes); D_FREE(dces); - vos_ioc_destroy(ioc, err != 0); + vos_ioc_destroy(ioc, err != 0 && tx_started); return err; } diff --git a/src/vos/vos_obj.c b/src/vos/vos_obj.c index 0015d91d916..117cf8baaab 100644 --- a/src/vos/vos_obj.c +++ b/src/vos/vos_obj.c @@ -1,6 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. - * (C) Copyright 2025 Hewlett Packard Enterprise Development LP + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -494,7 +494,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, if (rc != 0) goto reset; - rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb); + rc = vos_tx_begin(dth, vos_cont2umm(cont), cont->vc_pool->vp_sysdb, obj); if (rc != 0) goto reset; @@ -572,11 +572,9 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } if (rc == 0) { - vos_ts_set_wupdate(ts_set, epr.epr_hi); - - if (dtx_is_valid_handle(dth) && dth->dth_local) { - rc = vos_insert_oid(dth, cont, &oid); - } + rc = vos_dtx_record_oid(dth, cont, oid); + if (rc == 0) + vos_ts_set_wupdate(ts_set, epr.epr_hi); } rc = vos_tx_end(cont, dth, NULL, NULL, tx_started, NULL, rc); @@ -592,7 +590,7 @@ vos_obj_punch(daos_handle_t coh, daos_unit_oid_t oid, daos_epoch_t epoch, } if (obj != NULL) - vos_obj_release(obj, 0, rc != 0); + vos_obj_release(obj, 0, rc != 0 && tx_started); D_FREE(daes); D_FREE(dces); @@ -816,7 +814,8 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, daos_handle_t toh = DAOS_HDL_INVAL; int rc = 0; int i; - bool dirty = false; + bool dirty = false; + bool tx_started = false; cont = vos_hdl2cont(coh); D_ASSERT(cont != NULL); @@ -842,6 +841,7 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, } } +restart: rc = vos_obj_hold(cont, oid, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, &obj, NULL); if (rc != 0) @@ -851,6 +851,16 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, if (rc != 0) goto log; + if (unlikely(vos_obj_is_evicted(obj))) { + D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted, needs to restart TX.\n", DP_UOID(oid)); + umem_tx_end(umm, -DER_TX_RESTART); + vos_obj_release(obj, 0, false); + obj = NULL; + goto restart; + } + + tx_started = true; + rc = vos_obj_incarnate(obj, &epr, epoch, VOS_OBJ_VISIBLE | VOS_OBJ_CREATE, DAOS_INTENT_MARK, NULL); if (rc != 0) @@ -906,12 +916,14 @@ vos_obj_mark_corruption(daos_handle_t coh, daos_epoch_t epoch, uint32_t pm_ver, ", dkey (empty), akey_nr %u, epoch " DF_X64 ", pm_ver %u", DP_UOID(oid), akey_nr, epoch, pm_ver); + if (rc == -DER_ALREADY) + rc = 0; if (daos_handle_is_valid(toh)) dbtree_close(toh); if (obj != NULL) - vos_obj_release(obj, 0, true); + vos_obj_release(obj, 0, rc != 0 && tx_started); - return rc == -DER_ALREADY ? 0 : rc; + return rc; } static int diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index f572ebb03d9..a424fae41ef 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -1,5 +1,6 @@ /** * (C) Copyright 2016-2024 Intel Corporation. + * (C) Copyright 2025-2026 Hewlett Packard Enterprise Development LP * * SPDX-License-Identifier: BSD-2-Clause-Patent */ @@ -123,6 +124,12 @@ void vos_obj_evict(struct vos_object *obj); int vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid); +static inline bool +vos_obj_is_evicted(struct vos_object *obj) +{ + return obj != NULL && daos_lru_is_evicted(&obj->obj_llink); +} + /** * Create an object cache. * From bbabd94f87654c89584d692fa28739a15479dd1b Mon Sep 17 00:00:00 2001 From: Fan Yong Date: Wed, 7 Jan 2026 15:14:01 +0800 Subject: [PATCH 2/2] DAOS-18367 vos: handle review feedback Allow-unstable-test: true Signed-off-by: Fan Yong --- src/vos/vos_common.c | 4 ++-- src/vos/vos_io.c | 2 +- src/vos/vos_obj.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/vos/vos_common.c b/src/vos/vos_common.c index 0a154bf5cad..e5fe50dac97 100644 --- a/src/vos/vos_common.c +++ b/src/vos/vos_common.c @@ -223,7 +223,7 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, if (dth == NULL) { /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); - if (rc == 0 && unlikely(vos_obj_is_evicted(obj))) { + if (rc == 0 && obj != NULL && unlikely(vos_obj_is_evicted(obj))) { D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(1), need to restart TX.\n", DP_UOID(obj->obj_id)); rc = umem_tx_end(umm, -DER_TX_RESTART); @@ -246,7 +246,7 @@ vos_tx_begin(struct dtx_handle *dth, struct umem_instance *umm, bool is_sysdb, rc = umem_tx_begin(umm, vos_txd_get(is_sysdb)); if (rc == 0) { /* CPU may yield when umem_tx_begin, related object maybe evicted during that. */ - if (unlikely(vos_obj_is_evicted(obj))) { + if (obj != NULL && unlikely(vos_obj_is_evicted(obj))) { D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted(2), need to restart TX.\n", DP_UOID(obj->obj_id)); diff --git a/src/vos/vos_io.c b/src/vos/vos_io.c index afaf48cb320..4d105b91412 100644 --- a/src/vos/vos_io.c +++ b/src/vos/vos_io.c @@ -2572,7 +2572,7 @@ vos_update_end(daos_handle_t ioh, uint32_t pm_ver, daos_key_t *dkey, int err, if (err != 0) goto abort; - if (unlikely(vos_obj_is_evicted(ioc->ic_pinned_obj))) { + if (ioc->ic_pinned_obj != NULL && unlikely(vos_obj_is_evicted(ioc->ic_pinned_obj))) { D_DEBUG(DB_IO, "Obj " DF_UOID " is evicted during update, need to restart TX.\n", DP_UOID(ioc->ic_oid)); diff --git a/src/vos/vos_obj.h b/src/vos/vos_obj.h index a424fae41ef..be67bd27ac6 100644 --- a/src/vos/vos_obj.h +++ b/src/vos/vos_obj.h @@ -127,7 +127,7 @@ int vos_obj_evict_by_oid(struct vos_container *cont, daos_unit_oid_t oid); static inline bool vos_obj_is_evicted(struct vos_object *obj) { - return obj != NULL && daos_lru_is_evicted(&obj->obj_llink); + return daos_lru_is_evicted(&obj->obj_llink); } /**