about summary refs log tree commit diff stats
path: root/python/experiments
diff options
context:
space:
mode:
Diffstat (limited to 'python/experiments')
-rw-r--r-- python/experiments/test_dask.py 39
1 file changed, 22 insertions, 17 deletions
diff --git a/python/experiments/test_dask.py b/python/experiments/test_dask.py
index ce1031da..2e741b04 100644
--- a/python/experiments/test_dask.py
+++ b/python/experiments/test_dask.py
@@ -2,22 +2,27 @@ import dask.dataframe as dd
import numpy as np
strip_percent = lambda s: float(s.rstrip('%'))/100 if s else np.nan
+tenors = ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]
+
converters = {f'Spread{t}': strip_percent \
- for t in ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]}
+ for t in tenors}
converters['Recovery'] = strip_percent
-df = dd.read_csv("/home/guillaume/composites/*.csv", skiprows=2, converters=converters,
- dtype={'CompositeLevel4y': 'object', 'Rating4y': 'object',
- 'CompositeDepth5y': 'object', 'AvRating': 'object',
- 'ImpliedRating':'object'},
- parse_dates=['Date'])
-df.drop(['CompositeLevel6m'] + [f'CompositeLevel{y}y' for y in \
- [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]], axis=1)
-for col in ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country']:
- df[col] = df[col].astype('category')
-
-for col in ["Rating6m"] + [f"Rating{y}y" for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]:
- df[col] = df[col].astype('category')
-
-for engine in ["pyarrow", "fastparquet"]:
- for comp in ["snappy", "brotli"]:
- df.to_parquet(f"/home/guillaume/composites_parquet_{engine}_{comp}", engine=engine, compression=comp)
+dtype = {'Rating' + t: 'object' for t in tenors}
+dtype.update({'CompositeLevel' + t: 'object' for t in tenors})
+dtype.update({'ImpliedRating': 'object',
+ 'AvRating': 'object'})
+df = dd.read_csv("/home/guillaume/composites/*.csv",
+ skiprows=2,
+ converters=converters,
+ dtype=dtype,
+ parse_dates=['Date'])
+df.drop(['CompositeLevel' + t for t in tenors] + \
+ ["Contributor"], axis=1)
+to_categorize = ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country'] + \
+ ["Rating" + t for t in tenors]
+df = df.categorize(columns=to_categorize, index=False)
+df = df.set_index("Date")
+print("I'm here")
+df = df.repartition(npartitions=100)
+print("now here")
+df.to_parquet(f"/home/guillaume/composites_parquet_{engine}_default", engine=engine, compression="snappy")