# Author: Yubo "Paul" Yang
# Email: yubo.paul.yang@gmail.com
# Routines to roughly process scalar Dataframes.
# Mostly built around pandas's API.
#
# note: A scalar dataframe (scalar_df) is expected to contain the raw data,
# i.e. block-resolved expectation values, of a SINGLE calculation.
# If multiple runs are collected in the same dataframe, label by ['path'
# , 'fdat'] and use groupby before applying the functions in this script.
import numpy as np
import pandas as pd
from qharv.reel.scalar_dat import error
def merge_list(dfl, labels):
  """Merge a list of DataFrames sharing common labels.

  Successively inner-joins the frames on the given label columns.

  Args:
    dfl (list): a list of pd.DataFrame objects
    labels (list): a list of column labels to join on
  Return:
    pd.DataFrame: merged df
  """
  # functools.reduce works on both py2 (>=2.6) and py3; no version check needed
  from functools import reduce
  return reduce(lambda df1, df2: pd.merge(df1, df2, on=labels), dfl)
def mean_error_scalar_df(df, nequil=0):
  """ get mean and error from a dataframe of raw scalar data (per-block)
  take dataframe having columns ['LocalEnergy','Variance',...] to a
  dataframe having columns ['LocalEnergy_mean','LocalEnergy_error',...]
  Args:
    df (pd.DataFrame): raw scalar dataframe, presumably generated using
      qharv.scalar_dat.parse with extra label columns added to identify
      the different runs.
    nequil (int, optional): number of equilibration blocks to throw out
      for each run, default 0 (keep all data).
  Returns:
    pd.DataFrame: mean_error dataframe
  Raises:
    RuntimeError: if nequil > 0 but df has no 'index' column to
      identify block order.
  """
  from qharv.sieve import mean_df
  if nequil > 0:
    if 'index' not in df.columns:
      msg = 'time series must be indexed to drop equilibration,'
      msg += ' please add "index" to DataFrame column.'
      raise RuntimeError(msg)
    sel = df['index'] >= nequil  # zero indexing
    mydf = df.loc[sel]
  else:  # allow equilibration to be dropped outside of this function
    mydf = df
  return mean_df.create(mydf)
def reblock(trace, block_size, min_nblock=4, with_sigma=False):
  """ block scalar trace to remove autocorrelation;
  see usage example in reblock_scalar_df
  Args:
    trace (np.array): a trace of scalars, may have multiple columns
      !!!! assuming leading dimension is the number of current blocks.
    block_size (int): size of block in units of current block.
    min_nblock (int, optional): minimum number of blocks needed for
      meaningful statistics, default is 4.
    with_sigma (bool, optional): if True, also return the within-block
      standard deviation (ddof=1), default False.
  Returns:
    np.array: re-blocked trace; if with_sigma, a (mean, sigma) tuple.
  Raises:
    RuntimeError: if fewer than min_nblock blocks remain after blocking.
  """
  nblock = len(trace)//block_size
  nkeep = nblock*block_size
  if nblock < min_nblock:
    raise RuntimeError('only %d blocks left after reblock' % nblock)
  # leftover entries beyond nkeep are dropped; average within each new block
  blocked_trace = trace[:nkeep].reshape(nblock, block_size, *trace.shape[1:])
  ret = np.mean(blocked_trace, axis=1)
  if with_sigma:
    ret = (ret, np.std(blocked_trace, ddof=1, axis=1))
  return ret
def reblock_scalar_df(df, block_size, min_nblock=4):
  """ create a re-blocked scalar dataframe from a current scalar dataframe
  see reblock for details
  Args:
    df (pd.DataFrame): scalar dataframe, one row per current block
    block_size (int): size of new block in units of current block
    min_nblock (int, optional): minimum number of blocks to keep,
      default 4.
  Returns:
    pd.DataFrame: re-blocked scalar dataframe with the same columns
  """
  blocked = reblock(df.values, block_size, min_nblock=min_nblock)
  return pd.DataFrame(blocked, columns=df.columns)
def mix_est_correction(mydf, vseries, dseries, names,
                       series_name='series', group_name='group',
                       kind='linear', drop_missing_twists=False):
  """ extrapolate DMC mixed estimators to pure estimators
  Args:
    mydf (pd.DataFrame): dataframe of VMC and DMC mixed estimators
    vseries (int): VMC series id
    dseries (int): DMC series id
    names (list): list of DMC mixed estimator names to extrapolate
    series_name (str, optional): column name identifying the series,
      default 'series'
    group_name (str, optional): column name identifying the group
      (twist), default 'group'
    kind (str, optional): extrapolation kind, must be either 'linear'
      or 'log', default 'linear'
    drop_missing_twists (bool, optional): drop twists that do not
      appear in both series instead of raising, default False
  Returns:
    pd.DataFrame: entries copied from the DMC series, edited with
      extrapolated pure estimator means and errors.
      !!!! DataFrame index is not changed!
  Raises:
    RuntimeError: if the twists are misaligned (and
      drop_missing_twists is False), or if kind is unknown.
  """
  vsel = mydf[series_name] == vseries  # vmc
  msel = mydf[series_name] == dseries  # mixed estimator
  # make sure the groups (twists) are aligned!!!!
  vgroup = set(mydf.loc[vsel, group_name].values)
  dgroup = set(mydf.loc[msel, group_name].values)
  missing_twists = (dgroup-vgroup).union(vgroup-dgroup)
  nmiss = len(missing_twists)
  if nmiss > 0:
    if not drop_missing_twists:
      msg = 'twists ' + ' '.join([str(t) for t in missing_twists])
      msg += ' incomplete, set drop_missing_twists to ignore'
      raise RuntimeError(msg)
    # drop twists that are not present in both series
    # (use group_name rather than hard-coding the 'group' column)
    good_twist = mydf[group_name].apply(lambda x: x not in missing_twists)
    vsel = vsel & good_twist
    msel = msel & good_twist
  # get values and errors
  mnames = [name+'_mean' for name in names]
  enames = [name+'_error' for name in names]
  vym = mydf.loc[vsel, mnames].values
  vye = mydf.loc[vsel, enames].values
  mym = mydf.loc[msel, mnames].values
  mye = mydf.loc[msel, enames].values
  # perform extrapolation
  if kind == 'linear':
    # pure = 2*mixed - vmc
    dym = 2.*mym - vym
    dye = np.sqrt(4.*mye**2.+vye**2.)
  elif kind == 'log':
    # extrapolate mean in log space: ln(pure) = 2*ln(mixed) - ln(vmc)
    lnmym = np.log(mym)
    lnvym = np.log(vym)
    lndym = 2*lnmym-lnvym
    dym = np.exp(lndym)
    # propagate error
    # NOTE(review): textbook log-space propagation would use relative
    #  errors (mye/mym), not log of absolute errors - confirm intent.
    lnmye = np.log(mye)
    lnvye = np.log(vye)
    lndye = np.sqrt(4.*lnmye**2.+lnvye**2.)
    dye = dym*lndye
  else:
    msg = 'unknown mixed estimator extrapolation kind = %s' % kind
    raise RuntimeError(msg)
  # store in new data frame
  puredf = mydf.loc[msel].copy()
  puredf[mnames] = dym
  puredf[enames] = dye  # was computed but never stored before
  return puredf