# Convert the composites CSV files into a single snappy-compressed parquet
# dataset indexed by Date.
import dask.dataframe as dd
import numpy as np


def strip_percent(s):
    """Convert a percent string such as ``'12.5%'`` to a fraction (0.125).

    An empty string yields ``np.nan``.
    """
    return float(s.rstrip('%')) / 100 if s else np.nan


# Tenor suffixes used to build the per-tenor column names below.
tenors = ['6m'] + [f'{y}y' for y in [1, 2, 3, 4, 5, 7, 10, 15, 20, 30]]

# Spread<tenor> and Recovery are parsed through strip_percent while reading.
converters = {f'Spread{t}': strip_percent for t in tenors}
converters['Recovery'] = strip_percent

# Columns that are read with an explicit 'object' dtype.
dtype = {'Rating' + t: 'object' for t in tenors}
dtype.update({'CompositeLevel' + t: 'object' for t in tenors})
dtype.update({'ImpliedRating': 'object', 'AvRating': 'object'})

df = dd.read_csv(
    "/home/guillaume/composites/*.csv",
    skiprows=2,
    converters=converters,
    dtype=dtype,
    parse_dates=['Date'],
)

# drop() returns a new dataframe rather than modifying df in place, so the
# result has to be assigned back; otherwise the columns are never removed.
df = df.drop(['CompositeLevel' + t for t in tenors] + ["Contributor"], axis=1)

to_categorize = (
    ['Ccy', 'DocClause', 'Tier', 'Sector', 'Region', 'Country']
    + ["Rating" + t for t in tenors]
)
df = df.categorize(columns=to_categorize, index=False)
df = df.set_index("Date")
df = df.repartition(npartitions=100)
df.to_parquet(
    "/home/guillaume/composites_parquet",
    engine="fastparquet",
    compression="snappy",
)