Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
10b8fd1
init commit working files
asdfblarg Dec 13, 2024
7efe33c
create spec class
asdfblarg Dec 13, 2024
220a89c
refactor to use new spec class
asdfblarg Dec 13, 2024
b424e2c
convert to script and add argparse
asdfblarg Dec 13, 2024
9ce1f0b
convert to script
asdfblarg Dec 13, 2024
7e31cdb
add additional type hints
asdfblarg Dec 13, 2024
f60e947
add comments
asdfblarg Dec 13, 2024
931395b
fix typo
asdfblarg Dec 13, 2024
2bf1130
fix bug with delimiter not being used
asdfblarg Dec 13, 2024
f6dc2ef
add tests
asdfblarg Dec 13, 2024
71d3e48
change default output name
asdfblarg Dec 13, 2024
f24bf1d
reorganize files
asdfblarg Dec 13, 2024
7858eb9
add requirements.txt
asdfblarg Dec 13, 2024
1cbe161
add README
asdfblarg Dec 13, 2024
8483345
move tests to separate folder
asdfblarg Dec 13, 2024
989b425
adjust readme
asdfblarg Dec 13, 2024
5dc88c6
add arg parsing to generate_fwf.py and update readme
asdfblarg Dec 13, 2024
5545a1f
fix test paths now that files are in the test folder
asdfblarg Dec 13, 2024
4ed4bea
add Dockerfile and update readme
asdfblarg Dec 13, 2024
ea32f2b
minor tweak
asdfblarg Dec 13, 2024
fb18c2d
another minor adjustment
asdfblarg Dec 13, 2024
7bbfce6
isort changes
asdfblarg Dec 13, 2024
fc0e5cc
init working problem 2
asdfblarg Dec 13, 2024
8f657ba
split up dask read and write into separate modular functions
asdfblarg Dec 13, 2024
c361df8
add readme
asdfblarg Dec 13, 2024
b44f9f5
fix bug with negative num_times
asdfblarg Dec 13, 2024
2848679
fix bug introduced by refactor
asdfblarg Dec 13, 2024
4fed19e
add tests
asdfblarg Dec 13, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions problem_1/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Use an official Python runtime as a base image
FROM python:3.12-slim

# Run the remaining instructions from /src inside the image
WORKDIR /src

# Copy only the dependency list first so the install layer below can be
# reused from the build cache when just the source files change
COPY requirements.txt .

# Install the pinned dependencies without keeping pip's download cache
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context into the working directory
COPY . .
39 changes: 39 additions & 0 deletions problem_1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Problem 1

## Parse fixed width file

- Generate a fixed width file using the provided spec (the offsets provided in the spec file represent the length of each field).
- Implement a parser that can parse the fixed width file and generate a delimited file, like CSV for example.
- DO NOT use python libraries like pandas for parsing. You can use the standard library to write out a csv file (if you feel like it).
- Language choices (Python or Scala)
- Deliver source via github or bitbucket
- Bonus points if you deliver a docker container (Dockerfile) that can be used to run the code (too lazy to install stuff that you might use)
- Pay attention to encoding

## Install
```pip install -r requirements.txt```

### Create a test fixed width file with a spec.json
```python generate_fwf.py -o output.fwf -s spec.json```
Use the `-h` flag for more details

### Parse a fixed width file to an output csv
```python parse_fwf.py -f output.fwf -o output.csv -s spec.json```
Use the `-h` flag for more details

### Run tests
```python -m pytest```
or
```pytest```

## Docker container
You can build and run a Docker container with something like the following:
```
sudo docker build -t fwf-parse .
sudo docker run -it fwf-parse /bin/bash -c "/bin/bash"
```
Once inside the container, you can run the following as a quick test:
```
python src/generate_fwf.py -o output.fwf -s src/spec.json
python src/parse_fwf.py -f output.fwf -o output.csv -s src/spec.json
```
1 change: 1 addition & 0 deletions problem_1/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pytest==8.3.4
65 changes: 65 additions & 0 deletions problem_1/src/generate_fwf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import argparse

from spec import Spec


def generate_fwf_line(row_data: list[str], cols_data: dict):
    """Build one fixed width line from a row of string values.

    Args:
        row_data: Field values, one per column, in column order.
        cols_data: Mapping of column index to a ``(name, width)`` tuple.

    Returns:
        The concatenation of every field cut or space-padded to its width.
    """
    padded_fields = []
    for position in range(len(cols_data)):
        _name, width = cols_data[position]
        # Slice first so over-long values are cut, then pad short ones on the right.
        padded_fields.append(row_data[position][:width].ljust(width))

    return "".join(padded_fields)


def write_fwf_file(
    data_rows: list[list[str]],
    spec: "Spec",
    output_filename: str = "output.fwf",
    header: bool = True,
):
    """Write rows of string values to a fixed width file.

    Args:
        data_rows: Rows to write; each row is a list of field strings.
        spec: Spec instance supplying ``columns``, ``column_names`` and
            ``fixed_width_encoding``.
        output_filename: Path of the file to create or overwrite.
        header: When true, the column names are written as the first line.

    The file is encoded with ``spec.fixed_width_encoding`` and a confirmation
    message is printed once writing has finished.
    """
    with open(output_filename, "w", encoding=spec.fixed_width_encoding) as outfile:
        if header:
            # The header is formatted exactly like a data row, so long
            # column names are truncated to their column width.
            header_row = generate_fwf_line(spec.column_names, spec.columns)
            outfile.write(f"{header_row}\n")

        outfile.writelines(
            f"{generate_fwf_line(row, spec.columns)}\n" for row in data_rows
        )

    print(f"Generated '{output_filename}'")


if __name__ == "__main__":
    # Command line entry point: build a small sample fixed width file from a spec.
    parser = argparse.ArgumentParser(
        description="This script generates a quick test fixed width file"
    )
    parser.add_argument(
        "-o", "--output", help="Output fixed width file name", default="output.fwf"
    )
    parser.add_argument("-s", "--spec", help="Spec json file name", default="spec.json")
    args = parser.parse_args()

    output_filename = args.output
    spec_json_file = args.spec

    spec = Spec(spec_json_file)

    # Generate example data_rows: consecutive characters starting at 'a',
    # three rows' worth. For a 10-column spec this gives:
    # [['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'],
    #  ['k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't'],
    #  ['u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~']]
    num_cols = len(spec.columns)
    data = [chr(ord("a") + i) for i in range(num_cols * 3)]
    num_rows = len(data) // num_cols
    data_rows = [data[num_cols * i : num_cols * (i + 1)] for i in range(num_rows)]

    write_fwf_file(data_rows, spec, output_filename, header=True)
83 changes: 83 additions & 0 deletions problem_1/src/parse_fwf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import argparse

from spec import Spec


def read_fwf_file(input_fn: str, spec: "Spec"):
    """Read a fixed width file and return its lines.

    Args:
        input_fn: Path of the fixed width file to read.
        spec: Spec instance; its ``fixed_width_encoding`` is used to decode
            the file.

    Returns:
        A list with one string per line, with newline characters stripped
        from both ends of each line.

    Raises:
        Exception: If the file cannot be opened or decoded. The original
            error is attached as the cause.
    """
    try:
        with open(input_fn, "r", encoding=spec.fixed_width_encoding) as file:
            return [line.strip("\n") for line in file]
    except (OSError, UnicodeError, LookupError) as err:
        # OSError: missing/unreadable file; UnicodeError: bytes do not match
        # the encoding; LookupError: the encoding name itself is unknown.
        raise Exception(f"Error reading in: '{input_fn}'") from err


def parse_fwf_row(row_data: str, spec: "Spec"):
    """Split one fixed width line into its stripped field values.

    Args:
        row_data: A single line of the fixed width file.
        spec: Spec instance supplying ``num_columns`` and ``columns``
            (column index -> ``(name, width)``).

    Returns:
        A list with one string per column, with surrounding whitespace
        removed. Columns that lie beyond the end of a short line come back
        as empty strings.
    """
    cursor = 0
    fields = []
    for index in range(spec.num_columns):
        _name, width = spec.columns[index]
        fields.append(row_data[cursor : cursor + width].strip())
        cursor += width
    return fields


def parse_data(rows_data: list[str], spec: "Spec"):
    """Parse every non-empty fixed width line into a list of field values.

    Args:
        rows_data: Lines of the fixed width file.
        spec: Spec instance passed through to ``parse_fwf_row``.

    Returns:
        A list of parsed rows, one list of strings per non-empty input line.
    """
    # Empty strings (blank lines) are falsy and are left out of the result.
    return [parse_fwf_row(row, spec) for row in rows_data if row]


def write_csv(filename: str, data: list[list[str]], spec: "Spec", delimiter=","):
    """Write parsed rows to a delimited text file.

    Args:
        filename: Path of the file to create or overwrite.
        data: Parsed rows; each row is a list of field strings.
        spec: Spec instance supplying ``delimited_encoding``,
            ``include_header`` and ``column_names``.
        delimiter: Separator placed between fields.

    Fields that contain the delimiter or a quote character are quoted by the
    standard ``csv`` writer so the output stays parseable. A confirmation
    message is printed once writing has finished.
    """
    import csv  # local import: only this function needs it

    with open(filename, "w", encoding=spec.delimited_encoding) as file:
        if len(delimiter) == 1:
            # lineterminator="\n" keeps the same line endings the plain
            # f"{row}\n" writes produced.
            write_row = csv.writer(
                file, delimiter=delimiter, lineterminator="\n"
            ).writerow
        else:
            # csv.writer only accepts one-character delimiters; keep the
            # plain join for any other delimiter length.
            def write_row(fields):
                file.write(f"{delimiter.join(fields)}\n")

        if spec.include_header:
            write_row(spec.column_names)

        for row in data:
            write_row(row)

    print(f"Data parsed into: {filename}")


if __name__ == "__main__":
    # Command line entry point: fixed width file in, delimited file out.
    parser = argparse.ArgumentParser(
        description="This script parses a fixed width file and writes a delimited csv file."
    )
    parser.add_argument(
        "-f", "--file", help="Input fixed width file name", required=True
    )
    parser.add_argument(
        "-o", "--output", help="Output delimited csv file name", required=True
    )
    parser.add_argument("-s", "--spec", help="Spec json file name", default="spec.json")
    parser.add_argument(
        "-d", "--delimiter", help="Output field delimiter", default=","
    )
    args = parser.parse_args()

    fwf_filename = args.file
    csv_filename = args.output
    spec_file = args.spec
    delimiter = args.delimiter

    spec = Spec(spec_file)

    fwf_rows = read_fwf_file(fwf_filename, spec)

    # When the spec asks for a header, the first input line is dropped here;
    # write_csv writes the header from spec.column_names instead.
    if spec.include_header:
        fwf_rows = fwf_rows[1:]

    parsed_data = parse_data(fwf_rows, spec)
    write_csv(csv_filename, parsed_data, spec, delimiter=delimiter)
File renamed without changes.
32 changes: 32 additions & 0 deletions problem_1/src/spec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import json


class Spec:
    """Column layout and encoding settings loaded from a spec JSON file.

    Attributes set by ``__init__``:
        spec_data: The decoded JSON object.
        column_names: Value of ``ColumnNames``.
        offsets: Value of ``Offsets``; each entry is converted with ``int``
            and used as the column width.
        fixed_width_encoding: Value of ``FixedWidthEncoding``.
        include_header: Value of ``IncludeHeader`` as a real boolean.
        delimited_encoding: Value of ``DelimitedEncoding``.
        columns: Mapping of column index to ``(column_name, column_width)``.
        num_columns: Number of columns.
    """

    def __init__(self, spec_file: str):
        """Load ``spec_file`` and derive the column table.

        Raises:
            KeyError: If a required key is missing from the spec.
            Exception: If ``ColumnNames`` and ``Offsets`` differ in length.
        """
        self.spec_data: dict = load_spec_json(spec_file)
        self.column_names: list[str] = self.spec_data["ColumnNames"]
        self.offsets: list[str] = self.spec_data["Offsets"]
        self.fixed_width_encoding: str = self.spec_data["FixedWidthEncoding"]
        self.include_header: bool = self._as_bool(self.spec_data["IncludeHeader"])
        self.delimited_encoding: str = self.spec_data["DelimitedEncoding"]

        if len(self.column_names) != len(self.offsets):
            raise Exception("Spec ColumnNames and Offsets are different lengths")

        # Create dict with key: column_index, value: tuple(column_name, column_width)
        self.columns: dict = {
            index: (name, int(width))
            for index, (name, width) in enumerate(
                zip(self.column_names, self.offsets)
            )
        }

        self.num_columns: int = len(self.columns)

    @staticmethod
    def _as_bool(value) -> bool:
        """Convert a spec flag to bool, reading strings like "False" as false."""
        # NOTE(review): Offsets are converted from strings above, so
        # IncludeHeader may be stored as a string too - confirm against the
        # spec file. A plain truthiness test would treat "False" as true.
        if isinstance(value, str):
            return value.strip().lower() not in {"", "false", "0", "no", "off"}
        return bool(value)


def load_spec_json(spec_file: str):
    """Load and decode a spec JSON file.

    Args:
        spec_file: Path of the JSON file.

    Returns:
        The decoded JSON content.

    Raises:
        FileNotFoundError: If ``spec_file`` does not exist.
        Exception: If the file content is not valid JSON.
    """
    try:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default locale encoding.
        with open(spec_file, encoding="utf-8") as file:
            return json.load(file)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"'{spec_file}' not found.") from err
    except json.JSONDecodeError as err:
        raise Exception(f"Error decoding '{spec_file}': {err}") from err
Empty file added problem_1/src/test/__init__.py
Empty file.
3 changes: 3 additions & 0 deletions problem_1/src/test/test.fwf
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
f1 f2 f3 f4
hellworldexpand truncate th
1 2 3 a
51 changes: 51 additions & 0 deletions problem_1/src/test/test_generate_fwf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import pytest
from generate_fwf import generate_fwf_line, write_fwf_file
from spec import Spec


@pytest.fixture
def spec():
    """Provide a Spec built from the test spec JSON file."""
    test_spec_path = "src/test/test_spec.json"
    return Spec(test_spec_path)


def test_generate_fwf_line(spec):
    """A generated line spans the summed column widths and matches the expected text."""
    fields = ["hello", "world", "expand", "truncate this"]

    line = generate_fwf_line(fields, spec.columns)
    total_width = sum(width for _name, width in spec.columns.values())

    assert len(line) == total_width
    assert line == "hellworldexpand truncate th"


def test_write_fwf_file_header(spec, tmp_path):
    """With header=True the file starts with the column-name line."""
    target = f"{tmp_path}/output.fwf"
    rows = [
        ["hello", "world", "expand", "truncate this"],
        ["1", "2", "3", "a"],
    ]
    expected_lines = [
        "f1 f2 f3 f4 \n",
        "hellworldexpand truncate th\n",
        "1 2 3 a \n",
    ]

    write_fwf_file(rows, spec, output_filename=target, header=True)

    with open(target, "r") as written:
        actual_lines = written.readlines()

    assert actual_lines == expected_lines


def test_write_fwf_file_no_header(spec, tmp_path):
    """With header=False only the data rows are written."""
    target = f"{tmp_path}/output.fwf"
    rows = [
        ["hello", "world", "expand", "truncate this"],
        ["1", "2", "3", "a"],
    ]
    expected_lines = [
        "hellworldexpand truncate th\n",
        "1 2 3 a \n",
    ]

    write_fwf_file(rows, spec, output_filename=target, header=False)

    with open(target, "r") as written:
        actual_lines = written.readlines()

    assert actual_lines == expected_lines
99 changes: 99 additions & 0 deletions problem_1/src/test/test_parse_fwf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
import json
from unittest.mock import mock_open, patch

import pytest
from parse_fwf import parse_data, read_fwf_file, write_csv
from spec import Spec


@pytest.fixture
def spec():
    """Provide a Spec built from the test spec JSON file."""
    test_spec_path = "src/test/test_spec.json"
    return Spec(test_spec_path)


@pytest.fixture
def test_fwf():
    """Provide the path of the sample fixed width file."""
    sample_fwf_path = "src/test/test.fwf"
    return sample_fwf_path


@pytest.fixture
def parsed_data():
    """Provide two already-parsed rows used by the CSV writing tests."""
    text_row = ["hell", "world", "expand", "truncate th"]
    short_row = ["1", "2", "3", "a"]
    return [text_row, short_row]


def test_read_fwf_file(spec, test_fwf):
    """Reading the sample file returns its lines without newline characters."""
    lines = read_fwf_file(test_fwf, spec)

    # Compare once; the earlier chained `lines == lines == [...]` only added
    # a comparison of the list with itself.
    assert lines == [
        "f1 f2 f3 f4 ",
        "hellworldexpand truncate th",
        "1 2 3 a ",
    ]


def test_parse_data(spec):
    """Parsing skips the empty line and splits the other lines into fields."""
    fwf_lines = [
        "hellworldexpand truncate th",
        "1 2 3 a ",
        "",
    ]

    rows = parse_data(fwf_lines, spec)

    assert len(rows) == 2
    first_row, second_row = rows[0], rows[1]
    assert first_row == ["hell", "world", "expand", "truncate th"]
    assert second_row == ["1", "2", "3", "a"]


def test_write_csv(spec, parsed_data, tmp_path):
    """Default output is the header line followed by comma-separated rows."""
    destination = f"{tmp_path}/output.csv"
    expected_lines = [
        "f1,f2,f3,f4\n",
        "hell,world,expand,truncate th\n",
        "1,2,3,a\n",
    ]

    write_csv(destination, parsed_data, spec)

    with open(destination, "r") as written:
        actual_lines = written.readlines()

    assert actual_lines == expected_lines


def test_write_csv_no_header(spec, parsed_data, tmp_path):
    """With include_header switched off only the data rows are written."""
    destination = f"{tmp_path}/output.csv"
    expected_lines = [
        "hell,world,expand,truncate th\n",
        "1,2,3,a\n",
    ]

    spec.include_header = False
    write_csv(destination, parsed_data, spec)

    with open(destination, "r") as written:
        actual_lines = written.readlines()

    assert actual_lines == expected_lines


def test_write_csv_tab_delimiter(spec, parsed_data, tmp_path):
    """A tab delimiter separates the header and every row's fields."""
    destination = f"{tmp_path}/output.csv"
    expected_lines = [
        "f1\tf2\tf3\tf4\n",
        "hell\tworld\texpand\ttruncate th\n",
        "1\t2\t3\ta\n",
    ]

    write_csv(destination, parsed_data, spec, delimiter="\t")

    with open(destination, "r") as written:
        actual_lines = written.readlines()

    assert actual_lines == expected_lines
Loading