diff options
Diffstat (limited to 'python/experiments/test_dask.py')
| -rw-r--r-- | python/experiments/test_dask.py | 39 |
1 files changed, 22 insertions, 17 deletions
diff --git a/python/experiments/test_dask.py b/python/experiments/test_dask.py
index ce1031da..2e741b04 100644
--- a/python/experiments/test_dask.py
+++ b/python/experiments/test_dask.py
@@ -2,22 +2,27 @@ import dask.dataframe as dd
 import numpy as np
 
 strip_percent = lambda s: float(s.rstrip('%'))/100 if s else np.nan
+tenors = ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]
+
 converters = {f'Spread{t}': strip_percent \
-              for t in ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]}
+              for t in tenors}
 converters['Recovery'] = strip_percent
-df = dd.read_csv("/home/guillaume/composites/*.csv", skiprows=2, converters=converters,
-                 dtype={'CompositeLevel4y': 'object', 'Rating4y': 'object',
-                        'CompositeDepth5y': 'object', 'AvRating': 'object',
-                        'ImpliedRating':'object'},
-                 parse_dates=['Date'])
-df.drop(['CompositeLevel6m'] + [f'CompositeLevel{y}y' for y in \
-                                [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]], axis=1)
-for col in ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country']:
-    df[col] = df[col].astype('category')
-
-for col in ["Rating6m"] + [f"Rating{y}y" for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]:
-    df[col] = df[col].astype('category')
-
-for engine in ["pyarrow", "fastparquet"]:
-    for comp in ["snappy", "brotli"]:
-        df.to_parquet(f"/home/guillaume/composites_parquet_{engine}_{comp}", engine=engine, compression=comp)
+dtype = {'Rating' + t: 'object' for t in tenors}
+dtype.update({'CompositeLevel' + t: 'object' for t in tenors})
+dtype.update({'ImpliedRating': 'object',
+              'AvRating': 'object'})
+df = dd.read_csv("/home/guillaume/composites/*.csv",
+                 skiprows=2,
+                 converters=converters,
+                 dtype=dtype,
+                 parse_dates=['Date'])
+df.drop(['CompositeLevel' + t for t in tenors] + \
+        ["Contributor"], axis=1)
+to_categorize = ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country'] + \
+                ["Rating" + t for t in tenors]
+df = df.categorize(columns=to_categorize, index=False)
+df = df.set_index("Date")
+print("I'm here")
+df = df.repartition(npartitions=100)
+print("now here")
+df.to_parquet(f"/home/guillaume/composites_parquet_{engine}_default", engine=engine, compression="snappy")
