Source code for qharv.reel.scalar_dat
# Author: Yubo "Paul" Yang
# Email: yubo.paul.yang@gmail.com
# Routines to parse scalar table output. Mostly built around pandas's API.
import numpy as np
import pandas as pd

def read(dat_fname):
  """Read a scalar.dat file; it should be in table format.
  The header line should start with '#' and contain column labels.
  Args:
    dat_fname (str): name of input file
  Return:
    pd.DataFrame: df containing the table of data
  Example:
    >>> df = read('vmc.s001.scalar.dat')
  """
  with open(dat_fname, 'r') as f:
    text = f.read()
  return parse(text)

def write(dat_fname, df, header_pad='# ', **kwargs):
  """Write a dataframe to the plain-text scalar table format.
  A light wrapper around pandas DataFrame.to_string with defaults for
  index and float_format.
  Args:
    dat_fname (str): output data file name
    df (pd.DataFrame): data
    header_pad (str, optional): pad the beginning of the header with a
      comment string, default '# '
  """
  default_kws = {
    'index': False,
    'float_format': '%8.6f'
  }
  for k, v in default_kws.items():
    if k not in kwargs:
      kwargs[k] = v
  text = df.to_string(**kwargs)
  with open(dat_fname, 'w') as f:
    f.write(header_pad + text)
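
# Illustrative sketch (not part of the original module): round-trip a small
# table through write() and read(). The file name 'tmp.scalar.dat' and the
# values are hypothetical; note that parse() will add a 'Variance' column on
# the way back because both LocalEnergy and LocalEnergy_sq are present.
def _example_write_roundtrip():
  df = pd.DataFrame({'LocalEnergy': [-1.23, -1.25],
                     'LocalEnergy_sq': [1.52, 1.57]})
  write('tmp.scalar.dat', df)   # header line will start with '# '
  return read('tmp.scalar.dat')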

def parse(text):
  """Parse the text of a scalar.dat file; it should be in table format.
  Args:
    text (str): content of scalar.dat file
  Return:
    pd.DataFrame: table data
  Example:
    >>> with open('vmc.s001.scalar.dat', 'r') as f:
    ...   text = f.read()
    >>> df = parse(text)
  """
  fp = get_string_io(text)
  # try to read header line
  header = fp.readline().strip()
  fp.seek(0)
  # read data
  sep = r'\s+'
  if header.startswith('#'):
    df = pd.read_csv(fp, sep=sep)
    # remove the first column name '#'; pandas reads the '#' comment marker
    # as an extra leading label, leaving a trailing all-NaN column
    columns = df.columns
    df.drop(columns[-1], axis=1, inplace=True)
    df.columns = columns[1:]
    # calculate local energy variance if possible (QMCPACK specific)
    if ('LocalEnergy' in columns) and ('LocalEnergy_sq' in columns):
      df['Variance'] = df['LocalEnergy_sq'] - df['LocalEnergy']**2.
  else:
    df = pd.read_csv(fp, sep=sep, header=None)
  fp.close()
  # column labels should be strings
  df.columns = map(str, df.columns)
  return df
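
# Illustrative sketch (not part of the original module): a minimal
# QMCPACK-style table showing how the '#' header is handled; the column
# values are made up.
def _example_parse_header():
  text = ('#  index  LocalEnergy  LocalEnergy_sq\n'
          '   0      -1.23         1.52\n'
          '   1      -1.25         1.57\n')
  df = parse(text)
  # expect columns ['index', 'LocalEnergy', 'LocalEnergy_sq', 'Variance']
  return df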

def read_to_list(dat_fname):
  """Read a scalar.dat file into a list of pandas DataFrames.
  A line is a header if its first column cannot be converted to a float.
  Many scalar.dat files can be concatenated; a list is returned, one
  DataFrame per header.
  Args:
    dat_fname (str): name of input file
  Return:
    list: list of df(s) containing the table(s) of data
  Example:
    >>> dfl = read_to_list('gctas.dat')
    >>> df = pd.concat(dfl).reset_index(drop=True)
  """
  # first find the header lines
  with open(dat_fname, 'r') as f:
    text = f.read()
  idxl = find_header_lines(text)
  lines = text.split('\n')
  if len(idxl) == 0:  # no header
    return [parse(text)]
  idxl.append(None)
  # now read data and use headers to label columns
  dfl = []
  for bidx, eidx in zip(idxl[:-1], idxl[1:]):
    text1 = '\n'.join(lines[bidx:eidx])
    df1 = parse(text1)
    dfl.append(df1)
  return dfl

def get_string_io(text):
  """Obtain a StringIO object from text,
  compatible with both Python 2 and 3.
  Args:
    text (str): text to parse
  Return:
    StringIO: file-like object
  """
  import sys
  if sys.version_info[0] < 3:
    from StringIO import StringIO
    fp = StringIO(text)
  else:
    from io import StringIO
    try:
      fp = StringIO(text.decode())
    except AttributeError:
      fp = StringIO(text)
  return fp
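
# Illustrative sketch (not part of the original module): the returned object
# is file-like, so pandas can read from it directly without a temporary file.
def _example_string_io():
  fp = get_string_io('a b\n1 2\n')
  return pd.read_csv(fp, sep=r'\s+')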

def error(trace, kappa=None):
  """Calculate the error of a trace of scalar data.
  Args:
    trace (list): 1D iterable of floating-point numbers
    kappa (float, optional): auto-correlation time; by default it is
      re-calculated from the trace
  Return:
    float: stderr, the error of the mean of this trace of scalars
  """
  from qharv.reel.forlib.stats import corr
  stddev = np.std(trace, ddof=1)
  if np.isclose(stddev, 0):  # easy case
    return 0.0  # no error for constant trace
  if kappa is None:  # auto-correlation time not supplied
    kappa = corr(trace)
  # the effective number of independent samples is len(trace)/kappa;
  # stderr = stddev/sqrt(len(trace)/kappa)
  neffective = np.sqrt(len(trace)/kappa)
  err = stddev/neffective
  return err
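
# Illustrative sketch (not part of the original module): with kappa supplied,
# error() reduces to stddev*sqrt(kappa/len(trace)); for uncorrelated samples
# (kappa=1) this is the usual standard error of the mean.
def _example_error():
  rng = np.random.default_rng(0)
  trace = rng.normal(loc=-1.0, scale=0.1, size=1000)
  return error(trace, kappa=1.0)  # roughly 0.1/sqrt(1000)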

def single_column(df, column, nequil):
  """Calculate the mean and error of a column.
  The first nequil blocks of data are thrown out; the auto-correlation time
  is taken into account when calculating the error. The equilibrated data
  are assumed to be Gaussian distributed. The error is quoted at one
  standard deviation (1-sigma error).
  Args:
    df (pd.DataFrame): table of data (e.g. from scalar_dat.parse)
    column (str): name of column
    nequil (int): number of equilibration blocks
  Return:
    (float, float, float): (ymean, yerr, ycorr), where ymean is the mean of
      the column, yerr is its 1-sigma error, and ycorr is the
      auto-correlation time
  """
  from qharv.reel.forlib.stats import corr
  myy = df[column].values[nequil:]
  ymean = np.mean(myy)
  ycorr = corr(myy)
  yerr = error(myy, ycorr)
  return ymean, yerr, ycorr
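
# Illustrative sketch (not part of the original module): a typical analysis of
# one scalar column; the file name and nequil value are hypothetical.
def _example_single_column():
  df = read('vmc.s001.scalar.dat')
  ymean, yerr, ycorr = single_column(df, 'LocalEnergy', nequil=10)
  return ymean, yerr, ycorr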