import dask.dataframe as dd
import numpy as np

# Convert strings such as "40.5%" to fractions; empty values become NaN.
strip_percent = lambda s: float(s.rstrip('%')) / 100 if s else np.nan

tenors = ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]
converters = {f'Spread{t}': strip_percent for t in tenors}
converters['Recovery'] = strip_percent

df = dd.read_csv("/home/guillaume/composites/*.csv", skiprows=2, converters=converters,
                 dtype={'CompositeLevel4y': 'object', 'Rating4y': 'object',
                        'CompositeDepth5y': 'object', 'AvRating': 'object',
                        'ImpliedRating': 'object'},
                 parse_dates=['Date'])

# drop() returns a new dataframe, so the result must be reassigned.
df = df.drop([f'CompositeLevel{t}' for t in tenors], axis=1)

# Low-cardinality string columns are cheaper to store as categoricals.
for col in ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country']:
    df[col] = df[col].astype('category')
for col in [f'Rating{t}' for t in tenors]:
    df[col] = df[col].astype('category')

# Write the same data with every engine/compression combination.
for engine in ["pyarrow", "fastparquet"]:
    for comp in ["snappy", "brotli"]:
        df.to_parquet(f"/home/guillaume/composites_parquet_{engine}_{comp}",
                      engine=engine, compression=comp)
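
Presumably the point of writing the same frame four times is to compare the engine/compression combinations. Below is a minimal follow-up sketch, not part of the original listing: the helper dir_size_mb is hypothetical, and it simply sums the on-disk size of each output directory before reading one of them back with dd.read_parquet.

from pathlib import Path
import dask.dataframe as dd

def dir_size_mb(path):
    # Hypothetical helper: total size of all files under `path`, in MB.
    return sum(f.stat().st_size for f in Path(path).rglob('*') if f.is_file()) / 1e6

for engine in ["pyarrow", "fastparquet"]:
    for comp in ["snappy", "brotli"]:
        path = f"/home/guillaume/composites_parquet_{engine}_{comp}"
        print(f"{engine:11s} {comp:7s} {dir_size_mb(path):8.1f} MB")

# Read one output back; the categorical and datetime dtypes should round-trip.
df2 = dd.read_parquet("/home/guillaume/composites_parquet_pyarrow_snappy", engine="pyarrow")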