diff options
Diffstat (limited to 'python/experiments')
| -rw-r--r-- | python/experiments/test_dask.py | 19 |
1 files changed, 17 insertions, 2 deletions
diff --git a/python/experiments/test_dask.py b/python/experiments/test_dask.py
index b790f9b4..ce1031da 100644
--- a/python/experiments/test_dask.py
+++ b/python/experiments/test_dask.py
@@ -1,8 +1,23 @@
 import dask.dataframe as dd
-converters = {f'Spread{t}': lambda s: float(s.rstrip('%'))/100 if s else np.nan \
+import numpy as np
+
+strip_percent = lambda s: float(s.rstrip('%'))/100 if s else np.nan
+converters = {f'Spread{t}': strip_percent \
               for t in ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]}
-df = dd.read_csv("*.csv", skiprows=2, converters=converters,
+converters['Recovery'] = strip_percent
+df = dd.read_csv("/home/guillaume/composites/*.csv", skiprows=2, converters=converters,
                  dtype={'CompositeLevel4y': 'object', 'Rating4y': 'object',
                         'CompositeDepth5y': 'object',
                         'AvRating': 'object', 'ImpliedRating':'object'},
                  parse_dates=['Date'])
+df.drop(['CompositeLevel6m'] + [f'CompositeLevel{y}y' for y in \
+         [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]], axis=1)
+for col in ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country']:
+    df[col] = df[col].astype('category')
+
+for col in ["Rating6m"] + [f"Rating{y}y" for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]:
+    df[col] = df[col].astype('category')
+
+for engine in ["pyarrow", "fastparquet"]:
+    for comp in ["snappy", "brotli"]:
+        df.to_parquet(f"/home/guillaume/composites_parquet_{engine}_{comp}", engine=engine, compression=comp)
