"""Convert the composites CSV files to Parquet datasets.

Reads every CSV under ``/home/guillaume/composites/``, parses percentage
columns into fractions, drops the ``CompositeLevel*`` columns, casts the
descriptive and rating columns to categoricals, and writes one Parquet
dataset per (engine, compression) combination.
"""
import dask.dataframe as dd
import numpy as np

# Tenor suffixes shared by the Spread*, CompositeLevel* and Rating* columns.
TENORS = ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]


def strip_percent(s):
    """Convert a percentage string such as ``'12.5%'`` to a fraction.

    Trailing ``%`` characters are stripped and the number is divided by 100.
    An empty (falsy) input yields NaN.
    """
    return float(s.rstrip('%')) / 100 if s else np.nan


# Every spread column plus Recovery is stored as a percentage string.
converters = {f'Spread{t}': strip_percent for t in TENORS}
converters['Recovery'] = strip_percent

df = dd.read_csv(
    "/home/guillaume/composites/*.csv",
    skiprows=2,
    converters=converters,
    # Explicit object dtypes for these columns instead of relying on
    # dask's dtype inference.
    dtype={
        'CompositeLevel4y': 'object',
        'Rating4y': 'object',
        'CompositeDepth5y': 'object',
        'AvRating': 'object',
        'ImpliedRating': 'object',
    },
    parse_dates=['Date'],
)

# DataFrame.drop returns a new frame (dask has no in-place drop), so the
# result must be assigned back or the columns are silently kept.
df = df.drop([f'CompositeLevel{t}' for t in TENORS], axis=1)

# Descriptive columns first, then the per-tenor rating columns.
category_columns = (
    ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country']
    + [f'Rating{t}' for t in TENORS]
)
for col in category_columns:
    df[col] = df[col].astype('category')

# Write one dataset per engine/compression pair.
for engine in ["pyarrow", "fastparquet"]:
    for comp in ["snappy", "brotli"]:
        df.to_parquet(
            f"/home/guillaume/composites_parquet_{engine}_{comp}",
            engine=engine,
            compression=comp,
        )