DemystData · asdfblarg · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024 · Dec 13, 2024
diff --git a/problem_1/Dockerfile b/problem_1/Dockerfile
@@ -0,0 +1,10 @@
+# Use an official Python runtime as a base image
+FROM python:3.12-slim
+
+# Run every following instruction relative to /src
+WORKDIR /src
+
+# Copy only the dependency list first so it can be installed before the
+# rest of the project is copied in
+COPY requirements.txt .
+
+# Install the pinned dependencies without keeping pip's download cache
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy the rest of the build context into the working directory
+COPY . .
diff --git a/problem_1/README.md b/problem_1/README.md
@@ -0,0 +1,39 @@
+# Problem 1
+
+## Parse fixed width file
+
+- Generate a fixed width file using the provided spec (the offsets provided in the spec file represent the length of each field).
+- Implement a parser that can parse the fixed width file and generate a delimited file, like CSV for example.
+- DO NOT use python libraries like pandas for parsing. You can use the standard library to write out a csv file (If you feel like)
+- Language choices (Python or Scala)
+- Deliver source via github or bitbucket
+- Bonus points if you deliver a docker container (Dockerfile) that can be used to run the code (too lazy to install stuff that you might use)
+- Pay attention to encoding
+
+## Install
+```pip install -r requirements.txt```
+
+### Create a test fixed width file with a spec.json
+```python generate_fwf.py -o output.fwf -s spec.json```
+Use the `-h` flag for more details
+
+### Parse a fixed width file to an output csv
+```python parse_fwf.py -f output.fwf -o output.csv -s spec.json```
+Use the `-h` flag for more details
+
+### Run tests
+```python -m pytest```
+or
+```pytest```
+
+## Docker container
+You can build and run a Docker container with something like the following:
+```
+sudo docker build -t fwf-parse .
+sudo docker run -it fwf-parse /bin/bash -c "/bin/bash"
+```
+Once inside the container, you can run the following as a quick test:
+```
+python src/generate_fwf.py -o output.fwf -s src/spec.json
+python src/parse_fwf.py -f output.fwf -o output.csv -s src/spec.json
+```
diff --git a/problem_1/requirements.txt b/problem_1/requirements.txt
@@ -0,0 +1 @@
+pytest==8.3.4
diff --git a/problem_1/src/generate_fwf.py b/problem_1/src/generate_fwf.py
@@ -0,0 +1,65 @@
+import argparse
+
+from spec import Spec
+
+
+def generate_fwf_line(row_data: list[str], cols_data: dict):
+    """Build one fixed width line from a row of string values.
+
+    Each value is cut down to its column width and then right-padded with
+    spaces, so every field occupies exactly the width given in cols_data.
+    """
+    fields = []
+    for index in range(len(cols_data)):
+        # cols_data maps column index -> (column name, column width)
+        _name, width = cols_data[index]
+        fields.append(row_data[index][:width].ljust(width))
+    return "".join(fields)
+
+
+def write_fwf_file(
+    data_rows: list[list[str]],
+    spec: Spec,
+    output_filename: str = "output.fwf",
+    header: bool = True,
+) -> None:
+    """Write rows to a fixed width file laid out according to spec.
+
+    Args:
+        data_rows: Rows to write; each row holds one string per column.
+        spec: Loaded spec; supplies the column names, the column widths
+            and the encoding of the fixed width file.
+        output_filename: Path of the file to create or overwrite.
+        header: When true, the column names are written as the first line.
+    """
+    # The header is just another row: the column names at the same widths.
+    rows = [spec.column_names, *data_rows] if header else data_rows
+    with open(output_filename, "w", encoding=spec.fixed_width_encoding) as outfile:
+        for row in rows:
+            outfile.write(f"{generate_fwf_line(row, spec.columns)}\n")
+
+    print(f"Generated '{output_filename}'")
+
+
+if __name__ == "__main__":
+    # Command line entry point: write a small sample fixed width file.
+    parser = argparse.ArgumentParser(
+        description="This script generates a quick test fixed width file"
+    )
+    parser.add_argument(
+        "-o", "--output", help="Output fixed width file name", default="output.fwf"
+    )
+    parser.add_argument("-s", "--spec", help="Spec json file name", default="spec.json")
+    args = parser.parse_args()
+
+    spec = Spec(args.spec)
+
+    # Generate three example rows of consecutive characters starting at "a".
+    # With a 10 column spec this gives:
+    # [['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
+    # ['k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't'],
+    # ['u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']]
+    num_cols = len(spec.columns)
+    data_rows = [
+        [chr(ord("a") + num_cols * row + col) for col in range(num_cols)]
+        for row in range(3)
+    ]
+
+    write_fwf_file(data_rows, spec, args.output, header=True)
diff --git a/problem_1/src/parse_fwf.py b/problem_1/src/parse_fwf.py
@@ -0,0 +1,83 @@
+import argparse
+
+from spec import Spec
+
+
+def read_fwf_file(input_fn: str, spec: Spec) -> list[str]:
+    """Read a fixed width file and return its lines without line endings.
+
+    Args:
+        input_fn: Path of the fixed width file.
+        spec: Loaded spec; its fixed_width_encoding is used to decode the file.
+
+    Returns:
+        One string per line of the file, with the trailing newline removed.
+
+    Raises:
+        Exception: If the file cannot be opened, read or decoded. The
+            underlying error is attached as the cause.
+    """
+    try:
+        with open(input_fn, "r", encoding=spec.fixed_width_encoding) as file:
+            return [line.rstrip("\n") for line in file]
+    # OSError: missing/unreadable file; UnicodeError: bytes that do not match
+    # the encoding; LookupError: encoding name unknown to Python.
+    except (OSError, UnicodeError, LookupError) as err:
+        raise Exception(f"Error reading in: '{input_fn}'") from err
+
+
+def parse_fwf_row(row_data: str, spec: Spec) -> list[str]:
+    """Split one fixed width line into its stripped column values.
+
+    Args:
+        row_data: A single line of the fixed width file, without its newline.
+        spec: Loaded spec; spec.columns maps each column index to a
+            (column name, column width) tuple.
+
+    Returns:
+        One value per column, with surrounding whitespace removed. Columns
+        that lie beyond the end of a short line come back as empty strings.
+    """
+    values = []
+    start = 0
+    for index in range(spec.num_columns):
+        _name, width = spec.columns[index]
+        end = start + width
+        values.append(row_data[start:end].strip())
+        start = end
+    return values
+
+
+def parse_data(rows_data: list[str], spec: Spec) -> list[list[str]]:
+    """Parse every non-empty fixed width line into a list of column values.
+
+    Args:
+        rows_data: Lines of the fixed width file, without their newlines.
+        spec: Loaded spec describing the column widths.
+
+    Returns:
+        One list of column values per non-empty input line, in input order.
+    """
+    # Empty lines (for example a trailing blank line) carry no record.
+    return [parse_fwf_row(row, spec) for row in rows_data if row]
+
+
+def write_csv(
+    filename: str, data: list[list[str]], spec: Spec, delimiter: str = ","
+) -> None:
+    """Write parsed rows to a delimited text file.
+
+    Args:
+        filename: Path of the file to create or overwrite.
+        data: Parsed rows; each row holds one string per column.
+        spec: Loaded spec; supplies the output encoding, the column names
+            and whether a header line is written.
+        delimiter: Field separator. With a one-character delimiter, fields
+            are quoted where needed; longer separators are joined as-is.
+    """
+    import csv  # local import keeps this function self-contained
+
+    rows = [spec.column_names, *data] if spec.include_header else data
+    with open(filename, "w", encoding=spec.delimited_encoding) as file:
+        if len(delimiter) == 1:
+            # csv.writer quotes any field containing the delimiter, a quote
+            # character or a newline, so such values cannot break the row
+            # layout. lineterminator="\n" keeps the line endings this
+            # function has always written.
+            writer = csv.writer(file, delimiter=delimiter, lineterminator="\n")
+            writer.writerows(rows)
+        else:
+            # csv.writer only accepts one-character delimiters; longer
+            # separators keep the plain join (no quoting).
+            file.writelines(f"{delimiter.join(row)}\n" for row in rows)
+
+    print(f"Data parsed into: {filename}")
+
+
+if __name__ == "__main__":
+    # Command line entry point: fixed width file in, delimited file out.
+    parser = argparse.ArgumentParser(
+        description="This script parses a fixed width file and writes a delimited csv file."
+    )
+    parser.add_argument(
+        "-f", "--file", help="Input fixed width file name", required=True
+    )
+    parser.add_argument(
+        "-o", "--output", help="Output delimited csv file name", required=True
+    )
+    parser.add_argument("-s", "--spec", help="Spec json file name", default="spec.json")
+    parser.add_argument(
+        "-d", "--delimiter", help="Delimiter for the output file", default=","
+    )
+    args = parser.parse_args()
+
+    spec = Spec(args.spec)
+    fwf_rows = read_fwf_file(args.file, spec)
+
+    # When the spec declares a header, the first line holds the column names;
+    # drop it so it is not parsed as a data row.
+    if spec.include_header:
+        fwf_rows = fwf_rows[1:]
+
+    parsed_data = parse_data(fwf_rows, spec)
+    write_csv(args.output, parsed_data, spec, delimiter=args.delimiter)
diff --git a/spec.json → problem_1/src/spec.json b/spec.json → problem_1/src/spec.json
diff --git a/problem_1/src/spec.py b/problem_1/src/spec.py
@@ -0,0 +1,32 @@
+import json
+
+
+class Spec:
+    """Layout of a fixed width file, loaded from a JSON spec file.
+
+    The JSON document must provide the keys ColumnNames, Offsets,
+    FixedWidthEncoding, IncludeHeader and DelimitedEncoding.
+
+    Raises:
+        KeyError: If one of those keys is missing.
+        Exception: If ColumnNames and Offsets differ in length.
+    """
+
+    def __init__(self, spec_file: str):
+        self.spec_data: dict = load_spec_json(spec_file)
+        self.column_names: list[str] = self.spec_data["ColumnNames"]
+        # Each "offset" is the width of the matching column; values are
+        # passed through int() below, so numeric strings are accepted.
+        self.offsets: list = self.spec_data["Offsets"]
+        self.fixed_width_encoding: str = self.spec_data["FixedWidthEncoding"]
+
+        # A JSON string such as "False" is truthy in Python, so a flag stored
+        # as text is interpreted by its spelling rather than its truthiness.
+        include_header = self.spec_data["IncludeHeader"]
+        if isinstance(include_header, str):
+            include_header = include_header.strip().lower() not in {
+                "",
+                "false",
+                "0",
+                "no",
+                "n",
+            }
+        self.include_header: bool = bool(include_header)
+
+        self.delimited_encoding: str = self.spec_data["DelimitedEncoding"]
+
+        if len(self.column_names) != len(self.offsets):
+            raise Exception("Spec ColumnNames and Offsets are different lengths")
+
+        # key: column_index, value: tuple(column_name, column_width)
+        self.columns: dict = {
+            index: (name, int(width))
+            for index, (name, width) in enumerate(
+                zip(self.column_names, self.offsets)
+            )
+        }
+
+        self.num_columns: int = len(self.columns)
+
+
+def load_spec_json(spec_file: str) -> dict:
+    """Load the JSON spec file and return the decoded document.
+
+    Args:
+        spec_file: Path of the JSON spec file.
+
+    Returns:
+        The decoded JSON document.
+
+    Raises:
+        FileNotFoundError: If spec_file does not exist.
+        Exception: If the file content is not valid JSON.
+    """
+    try:
+        # JSON is exchanged as UTF-8; do not depend on the platform default.
+        with open(spec_file, encoding="utf-8") as file:
+            return json.load(file)
+    except FileNotFoundError as e:
+        raise FileNotFoundError(f"'{spec_file}' not found.") from e
+    except json.JSONDecodeError as e:
+        raise Exception(f"Error decoding '{spec_file}': {e}") from e
diff --git a/problem_1/src/test/__init__.py b/problem_1/src/test/__init__.py
diff --git a/problem_1/src/test/test.fwf b/problem_1/src/test/test.fwf
@@ -0,0 +1,3 @@
+f1  f2   f3        f4         
+hellworldexpand    truncate th
+1   2    3         a          
diff --git a/problem_1/src/test/test_generate_fwf.py b/problem_1/src/test/test_generate_fwf.py
@@ -0,0 +1,51 @@
+import pytest
+from generate_fwf import generate_fwf_line, write_fwf_file
+from spec import Spec
+
+
+@pytest.fixture
+def spec():
+    """Provide a Spec loaded from the test spec file."""
+    spec_path = "src/test/test_spec.json"
+    return Spec(spec_path)
+
+
+def test_generate_fwf_line(spec):
+    """A generated line is truncated and padded to the total spec width."""
+    values = ["hello", "world", "expand", "truncate this"]
+    expected_width = sum(width for _name, width in spec.columns.values())
+
+    line = generate_fwf_line(values, spec.columns)
+
+    assert len(line) == expected_width
+    assert line == "hellworldexpand    truncate th"
+
+
+def test_write_fwf_file_header(spec, tmp_path):
+    """The header line and every data row are written at the spec widths."""
+    output_file = f"{tmp_path}/output.fwf"
+    data_rows = [["hello", "world", "expand", "truncate this"], ["1", "2", "3", "a"]]
+
+    write_fwf_file(data_rows, spec, output_filename=output_file, header=True)
+
+    # Read back with the encoding the file was written in, not the platform default.
+    with open(output_file, "r", encoding=spec.fixed_width_encoding) as file:
+        lines = file.readlines()
+
+    assert lines == [
+        "f1  f2   f3        f4         \n",
+        "hellworldexpand    truncate th\n",
+        "1   2    3         a          \n",
+    ]
+
+
+def test_write_fwf_file_no_header(spec, tmp_path):
+    """With header=False only the data rows are written."""
+    output_file = f"{tmp_path}/output.fwf"
+    data_rows = [["hello", "world", "expand", "truncate this"], ["1", "2", "3", "a"]]
+
+    write_fwf_file(data_rows, spec, output_filename=output_file, header=False)
+
+    # Read back with the encoding the file was written in, not the platform default.
+    with open(output_file, "r", encoding=spec.fixed_width_encoding) as file:
+        lines = file.readlines()
+
+    assert lines == [
+        "hellworldexpand    truncate th\n",
+        "1   2    3         a          \n",
+    ]
diff --git a/problem_1/src/test/test_parse_fwf.py b/problem_1/src/test/test_parse_fwf.py
@@ -0,0 +1,99 @@
+import json
+from unittest.mock import mock_open, patch
+
+import pytest
+from parse_fwf import parse_data, read_fwf_file, write_csv
+from spec import Spec
+
+
+@pytest.fixture
+def spec():
+    """Provide a Spec loaded from the test spec file."""
+    spec_path = "src/test/test_spec.json"
+    return Spec(spec_path)
+
+
+@pytest.fixture
+def test_fwf():
+    """Provide the path of the sample fixed width file."""
+    fwf_path = "src/test/test.fwf"
+    return fwf_path
+
+
+@pytest.fixture
+def parsed_data():
+    """Provide two already-parsed rows of column values."""
+    first_row = ["hell", "world", "expand", "truncate th"]
+    second_row = ["1", "2", "3", "a"]
+    return [first_row, second_row]
+
+
+def test_read_fwf_file(spec, test_fwf):
+    """Reading returns every line of the file with its newline removed."""
+    lines = read_fwf_file(test_fwf, spec)
+    assert lines == [
+        "f1  f2   f3        f4         ",
+        "hellworldexpand    truncate th",
+        "1   2    3         a          ",
+    ]
+
+
+def test_parse_data(spec):
+    """Fixed width lines are split into stripped values; empty lines are skipped."""
+    fwf_lines = [
+        "hellworldexpand    truncate th",
+        "1   2    3         a          ",
+        "",
+    ]
+
+    rows = parse_data(fwf_lines, spec)
+
+    assert len(rows) == 2
+    first_row, second_row = rows[0], rows[1]
+    assert first_row == ["hell", "world", "expand", "truncate th"]
+    assert second_row == ["1", "2", "3", "a"]
+
+
+def test_write_csv(spec, parsed_data, tmp_path):
+    """The header and the rows are written comma separated by default."""
+    csv_file = f"{tmp_path}/output.csv"
+
+    write_csv(csv_file, parsed_data, spec)
+
+    # Read back with the encoding the file was written in, not the platform default.
+    with open(csv_file, "r", encoding=spec.delimited_encoding) as file:
+        lines = file.readlines()
+
+    assert lines == [
+        "f1,f2,f3,f4\n",
+        "hell,world,expand,truncate th\n",
+        "1,2,3,a\n",
+    ]
+
+
+def test_write_csv_no_header(spec, parsed_data, tmp_path):
+    """No header line is written when the spec disables the header."""
+    csv_file = f"{tmp_path}/output.csv"
+
+    spec.include_header = False
+    write_csv(csv_file, parsed_data, spec)
+
+    # Read back with the encoding the file was written in, not the platform default.
+    with open(csv_file, "r", encoding=spec.delimited_encoding) as file:
+        lines = file.readlines()
+
+    assert lines == [
+        "hell,world,expand,truncate th\n",
+        "1,2,3,a\n",
+    ]
+
+
+def test_write_csv_tab_delimiter(spec, parsed_data, tmp_path):
+    """A custom delimiter separates the header and the row values."""
+    csv_file = f"{tmp_path}/output.csv"
+
+    write_csv(csv_file, parsed_data, spec, delimiter="\t")
+
+    # Read back with the encoding the file was written in, not the platform default.
+    with open(csv_file, "r", encoding=spec.delimited_encoding) as file:
+        lines = file.readlines()
+
+    assert lines == [
+        "f1\tf2\tf3\tf4\n",
+        "hell\tworld\texpand\ttruncate th\n",
+        "1\t2\t3\ta\n",
+    ]