Source code for qharv.sieve.scalar_df

# Author: Yubo "Paul" Yang
# Email: yubo.paul.yang@gmail.com
# Routines to roughly process scalar Dataframes.
# Mostly built around pandas's API.
#
#  note: A scalar dataframe (scalar_df) is expected to contain the raw data,
# i.e. block-resolved expectation values, of a SINGLE calculation.
# If multiple runs are collected in the same dataframe, label by ['path'
# , 'fdat'] and use groupby before applying the functions in this script.
import numpy as np
import pandas as pd
from qharv.reel.scalar_dat import error

[docs]def merge_list(dfl, labels):
  """Merge a list of DataFrames sharing common labels

  Args:
    dfl (list): a list of pd.DataFrame objects
    labels (list): a list of column labels
  Return:
    pd.DataFrame: merged df
  """
  import sys
  if sys.version[0] != "2":
    from functools import reduce
  df = reduce(lambda df1, df2: pd.merge(df1, df2, on=labels), dfl)
  return df

[docs]def mean_error_scalar_df(df, nequil=0):
  """ get mean and average from a dataframe of raw scalar data (per-block)
   take dataframe having columns ['LocalEnergy','Variance',...] to a
   dataframe having columns ['LocalEnergy_mean','LocalEnergy_error',...]

   Args:
    df (pd.DataFrame): raw scalar dataframe, presumable generated using
     qharv.scalar_dat.parse with extra labels columns added to identify
     the different runs.
    nequil (int, optional): number of equilibration blocks to throw out
     for each run, default 0 (keep all data).
   Returns:
    pd.DataFrame: mean_error dataframe
  """
  from qharv.sieve import mean_df
  if nequil > 0:
    if 'index' not in df.columns:
      msg = 'time series must be indexed to drop equilibration,'
      msg += ' please add "index" to DataFrame column.'
      raise RuntimeError(msg)
    sel = df['index'] >= nequil  # zero indexing
    mydf = df.loc[sel]
  else:  # allow equilibration to be dropped outside of this function
    mydf = df
  return mean_df.create(mydf)

[docs]def reblock(trace, block_size, min_nblock=4, with_sigma=False):
  """ block scalar trace to remove autocorrelation;
  see usage example in reblock_scalar_df

  Args:
    trace (np.array): a trace of scalars, may have multiple columns
      !!!! assuming leading dimension is the number of current blocks.
    block_size (int): size of block in units of current block.
    min_nblock (int,optional): minimum number of blocks needed for
      meaningful statistics, default is 4.
  Returns:
    np.array: re-blocked trace.
  """
  nblock= len(trace)//block_size
  nkeep = nblock*block_size
  if (nblock < min_nblock):
    raise RuntimeError('only %d blocks left after reblock' % nblock)
  # end if
  blocked_trace = trace[:nkeep].reshape(nblock, block_size, *trace.shape[1:])
  ret = np.mean(blocked_trace, axis=1)
  if with_sigma:
    ret = (ret, np.std(blocked_trace, ddof=1, axis=1))
  return ret

[docs]def reblock_scalar_df(df, block_size, min_nblock=4):
  """ create a re-blocked scalar dataframe from a current scalar dataframe
   see reblock for details
  """
  return pd.DataFrame(
    reblock(df.values, block_size, min_nblock=min_nblock),
    columns=df.columns
  )

[docs]def poly_extrap_to_x0(myx, myym, myye, order, return_fit=False):
  """ fit 1D data to 1D polynomial and extrpolate to x=0

  The fit proceeds in two steps. The first polyfit does not take error into
  account. It estimates the extrapolated value, which is then used to setup
  a trust region (bounds). Using the trust region, curve_fit can robustly
  estimate the error of the extrapolation.

  Args:
    myx (np.array): x values
    myym (np.array): y values
    myye (np.array): y errors (1 sigma)
    order (int): order of 1D polynomial
    return_fit (bool, optional): if true, then return fit paramters
  Return:
    2-tuple: floats (y0m, y0e), y mean and error at x=0
  """
  import scipy.optimize as op

  if order != 1:
    raise NotImplementedError('order=%d not supported' % order)
  # keep target as zeroth parameter
  model = lambda x, a, b: a+b*x

  # setup trust region using 10*sigma around naive extrapolation

  #  first do a fit without error
  popt0 = np.polyfit(myx, myym, order)
  val0 = np.poly1d(popt0)(0)

  #  then use rough fit to setup trust region
  sig0 = max(myye)  # extrapolated error should be larger than all data
  nsig = 10  # !!!! hard-code 10 sigma
  lbounds = [-np.inf for i in range(order+1)]
  ubounds = [ np.inf for i in range(order+1)]
  lbounds[0] = val0 - nsig*sig0
  ubounds[0] = val0 + nsig*sig0
  bounds = (lbounds, ubounds)

  # finally fit using error and trust region
  popt, pcov = op.curve_fit(model, myx, myym,
    sigma=myye, absolute_sigma=True, bounds=bounds, method='trf')
  perr = np.sqrt(np.diag(pcov))
  # return popt,perr to check fit

  y0m = popt[0]
  y0e = perr[0]
  ret = (y0m, y0e)
  if return_fit:
    ret = (y0m, y0e, popt, pcov)
  return ret

[docs]def ts_extrap_obs(calc_df, sel, tname, obs, order=1):
  """ extrapolate a single dmc observable to zero time-step limit

  Args:
    calc_df (pd.DataFrame): must contain columns [tname, obs_mean, obs_error]
    sel (np.array): boolean selector array
    tname (str): timestep column name, e.g. 'timestep'
    obs (str): observable column name, e.g. 'LocalEnergy'
  Return:
    tuple: (myx, y0m, y0e) of type (list, float, float) containing
    (timesteps, t=0 value, t=0 error)
  """

  # !!!! need to check that the selected runs are actually DMC !
  myx  = np.array(calc_df.loc[sel, tname].values)
  myym = np.array(calc_df.loc[sel, obs+'_mean'].values)
  myye = np.array(calc_df.loc[sel, obs+'_error'].values)

  y0m, y0e = poly_extrap_to_x0(myx, myym, myye, order)

  return myx, y0m, y0e

[docs]def ts_extrap(calc_df, issl, obsl,
  tname='timestep', series_name='series', **kwargs):
  """ extrapolate all dmc observables to zero time-step limit

  Args:
    calc_df (pd.DataFrame): must contain columns [tname, series_name]
    issl (list): list of DMC series index to use in fit
    obsl (list): a list of observable names to extrapolate
  Return:
    pd.Series: an entry copied from the smallest time-step DMC entry,
    then edited with extrapolated energy and corresponding info
    !!!! series number is unchanged
  """

  sel  = calc_df[series_name].apply(lambda x: x in issl)
  nfound = len(calc_df.loc[sel])
  if nfound != len(issl):
    msg = 'found %d series, when %d are requested' % (nfound, len(issl))
    raise RuntimeError(msg)

  # copy smallest timestep DMC entry
  myx  = calc_df.loc[sel, tname]
  entry = calc_df.loc[calc_df[tname] == min(myx)].copy()

  # fill entry with new data
  entry[tname] = 0

  for obs in obsl:
    myx0, y0m, y0e = ts_extrap_obs(calc_df, sel,
      tname, obs, **kwargs)
    entry['%s_mean' % obs]  = y0m
    entry['%s_error' % obs] = y0e
  return entry

[docs]def mix_est_correction(mydf, vseries, dseries, namesm,
  series_name='series', group_name='group', kind='linear',
  drop_missing_twists=False):
  """ extrapolate dmc energy to zero time-step limit
  Args:
    mydf (pd.DataFrame): dataframe of VMC and DMC mixed estimators
    vseries (int): VMC series id
    dseries (int): DMC series id
    names (list): list of DMC mixed estimators names to extrapolate
    series_name (str,optional): column name identifying the series
    kind (str,optinoal): extrapolation kind, must be either 'linear' or 'log'
  Returns:
    pd.Series: an entry copied from the smallest time-step DMC entry,
      then edited with extrapolated pure estimators.
      !!!! Series index is not changed!
  """
  vsel = mydf[series_name] == vseries  # vmc
  msel = mydf[series_name] == dseries  # mixed estimator

  # make sure the groups (twists) are aligned!!!!
  vgroup = set(mydf.loc[vsel, group_name].values)
  dgroup = set(mydf.loc[msel, group_name].values)

  missing_twists = (dgroup-vgroup).union(vgroup-dgroup)
  nmiss = len(missing_twists)
  if (nmiss > 0):
    if (not drop_missing_twists):
      msg = 'twists ' + ' '.join([str(t) for t in missing_twists])
      msg += ' incomplete, set drop_missing_twists to ignore'
      raise RuntimeError(msg)
    else:  # drop missing twists
      good_twist = mydf.group.apply(lambda x: x not in missing_twists)
      vsel = vsel & good_twist
      msel = msel & good_twist
    # end if
  # end if

  # get values and errors
  mnames = [name+'_mean' for name in names]
  enames = [name+'_error' for name in names]
  vym = mydf.loc[vsel, mnames].values
  vye = mydf.loc[vsel, enames].values
  mym = mydf.loc[msel, mnames].values
  mye = mydf.loc[msel, enames].values

  # perform extrapolation
  if kind == 'linear':
    dym = 2.*mym - vym
    dye = np.sqrt(4.*mye**2.+vye**2.)
  elif kind == 'log':
    # extrapolate mean
    lnmym = np.log(mym)
    lnvym = np.log(vym)
    lndym = 2*lnmym-lnvym
    dym = np.exp(lndym)

    # propagate error
    lnmye = np.log(mye)
    lnvye = np.log(vye)
    lndye = np.sqrt(4.*lnmye**2.+lnvye**2.)
    dye = dym*lndye
  else:
    msg = 'unknown mixed estimator extrapolation kind = %s' % kind
    raise RuntimeError(msg)
  # end if

  # store in new data frame
  puredf = mydf.loc[msel].copy()
  puredf[mnames] = dym
  return puredf