|
2 | 2 | """Generic Pipeline object to define how DVE should be interacted with.""" |
3 | 3 | from itertools import starmap |
4 | 4 | import json |
| 5 | +from pathlib import Path |
5 | 6 | import re |
6 | 7 | from collections import defaultdict |
7 | 8 | from collections.abc import Generator, Iterable, Iterator |
|
16 | 17 |
|
17 | 18 | from dve.core_engine.exceptions import CriticalProcessingError |
18 | 19 | from dve.core_engine.message import FeedbackMessage |
| 20 | +from dve.parser.file_handling.implementations.file import LocalFilesystemImplementation |
| 21 | +from dve.parser.file_handling.service import _get_implementation |
19 | 22 | import dve.reporting.excel_report as er |
20 | 23 | from dve.core_engine.backends.base.auditing import BaseAuditingManager |
21 | 24 | from dve.core_engine.backends.base.contract import BaseDataContract |
@@ -635,7 +638,18 @@ def business_rule_step( |
635 | 638 | ) |
636 | 639 |
|
637 | 640 | return successful_files, unsucessful_files, failed_processing |
638 | | - |
| 641 | + |
| 642 | + def _publish_error_aggregates(self, submission_id:str, aggregates_df: pl.DataFrame) -> URI: |
| 643 | + """Store error aggregates as parquet for auditing""" |
| 644 | + output_uri = fh.joinuri(self.processed_files_path, submission_id, "audit", "error_aggregates.parquet") |
| 645 | + if isinstance(_get_implementation(output_uri), LocalFilesystemImplementation): |
| 646 | + output_uri = fh.file_uri_to_local_path(output_uri) |
| 647 | + output_uri.parent.mkdir(parents=True, exist_ok=True) |
| 648 | + output_uri = output_uri.as_posix() |
| 649 | + aggregates_df = aggregates_df.with_columns(pl.lit(submission_id).alias("submission_id")) |
| 650 | + aggregates_df.write_parquet(output_uri) |
| 651 | + return output_uri |
| 652 | + |
639 | 653 | @lru_cache() # noqa: B019 |
640 | 654 | def _get_error_dataframes(self, submission_id: str): |
641 | 655 | if not self.processed_files_path: |
@@ -738,6 +752,8 @@ def error_report(self, |
738 | 752 | ) |
739 | 753 | with fh.open_stream(report_uri, "wb") as stream: |
740 | 754 | stream.write(er.ExcelFormat.convert_to_bytes(workbook)) |
| 755 | + |
| 756 | + self._publish_error_aggregates(submission_info.submission_id, aggregates) |
741 | 757 |
|
742 | 758 | return submission_info, submission_status, sub_stats, report_uri |
743 | 759 |
|
@@ -842,3 +858,4 @@ def cluster_pipeline_run( |
842 | 858 | ) |
843 | 859 |
|
844 | 860 | yield from report_results # type: ignore |
| 861 | + |
0 commit comments