Source code for qharv.reel.scalar_dat
# Author: Yubo "Paul" Yang
# Email: yubo.paul.yang@gmail.com
# Routines to parse scalar table output. Mostly built around pandas's API.
import numpy as np
import pandas as pd

def read(dat_fname):
  """Read a scalar.dat file; it should be in table format.
  The header line should start with '#' and contain column labels.
  Args:
    dat_fname (str): name of input file
  Return:
    pd.DataFrame: df containing the table of data
  Example:
    >>> df = read('vmc.s001.scalar.dat')
  """
  with open(dat_fname, 'r') as f:
    text = f.read()
  return parse(text)

def write(dat_fname, df, header_pad='# ', **kwargs):
  """Write a dataframe to the plain-text scalar table format.
  A light wrapper around pandas DataFrame.to_string with defaults for
  index and float_format.
  Args:
    dat_fname (str): output data file name
    df (pd.DataFrame): data
    header_pad (str, optional): pad the beginning of the header with a
      comment string, default '# '
  """
  default_kws = {
    'index': False,
    'float_format': '%8.6f'
  }
  for k, v in default_kws.items():
    if k not in kwargs:
      kwargs[k] = v
  text = df.to_string(**kwargs)
  with open(dat_fname, 'w') as f:
    f.write(header_pad + text)
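
# Illustrative sketch (not part of the original module): round-trip a small
# table through write() and read(). The file name 'tmp.scalar.dat' and the
# values are hypothetical; note that parse() will add a 'Variance' column on
# the way back because both LocalEnergy and LocalEnergy_sq are present.
def _example_write_roundtrip():
  df = pd.DataFrame({'LocalEnergy': [-1.23, -1.25],
                     'LocalEnergy_sq': [1.52, 1.57]})
  write('tmp.scalar.dat', df)   # header line will start with '# '
  return read('tmp.scalar.dat')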

def parse(text):
  """Parse the text of a scalar.dat file; it should be in table format.
  Args:
    text (str): content of scalar.dat file
  Return:
    pd.DataFrame: table data
  Example:
    >>> with open('vmc.s001.scalar.dat', 'r') as f:
    ...   text = f.read()
    >>> df = parse(text)
  """
  fp = get_string_io(text)
  # try to read header line
  header = fp.readline().strip()
  fp.seek(0)
  # read data
  sep = r'\s+'
  if header.startswith('#'):
    df = pd.read_csv(fp, sep=sep)
    # remove the first column name '#'; pandas reads the '#' comment marker
    # as an extra leading label, leaving a trailing all-NaN column
    columns = df.columns
    df.drop(columns[-1], axis=1, inplace=True)
    df.columns = columns[1:]
    # calculate local energy variance if possible (QMCPACK specific)
    if ('LocalEnergy' in columns) and ('LocalEnergy_sq' in columns):
      df['Variance'] = df['LocalEnergy_sq'] - df['LocalEnergy']**2.
  else:
    df = pd.read_csv(fp, sep=sep, header=None)
  fp.close()
  # column labels should be strings
  df.columns = map(str, df.columns)
  return df
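
# Illustrative sketch (not part of the original module): a minimal
# QMCPACK-style table showing how the '#' header is handled; the column
# values are made up.
def _example_parse_header():
  text = ('#  index  LocalEnergy  LocalEnergy_sq\n'
          '   0      -1.23         1.52\n'
          '   1      -1.25         1.57\n')
  df = parse(text)
  # expect columns ['index', 'LocalEnergy', 'LocalEnergy_sq', 'Variance']
  return df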

def read_to_list(dat_fname):
  """Read a scalar.dat file into a list of pandas DataFrames.
  A line is a header if its first column cannot be converted to a float.
  Many scalar.dat files can be concatenated; a list is returned, one
  DataFrame per header.
  Args:
    dat_fname (str): name of input file
  Return:
    list: list of df(s) containing the table(s) of data
  Example:
    >>> dfl = read_to_list('gctas.dat')
    >>> df = pd.concat(dfl).reset_index(drop=True)
  """
  # first find the header lines
  with open(dat_fname, 'r') as f:
    text = f.read()
  idxl = find_header_lines(text)
  lines = text.split('\n')
  if len(idxl) == 0:  # no header
    return [parse(text)]
  idxl.append(None)
  # now read data and use headers to label columns
  dfl = []
  for bidx, eidx in zip(idxl[:-1], idxl[1:]):
    text1 = '\n'.join(lines[bidx:eidx])
    df1 = parse(text1)
    dfl.append(df1)
  return dfl

def get_string_io(text):
  """Obtain a StringIO object from text,
  compatible with both Python 2 and 3.
  Args:
    text (str): text to parse
  Return:
    StringIO: file-like object
  """
  import sys
  if sys.version_info[0] < 3:
    from StringIO import StringIO
    fp = StringIO(text)
  else:
    from io import StringIO
    try:
      fp = StringIO(text.decode())
    except AttributeError:
      fp = StringIO(text)
  return fp
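
# Illustrative sketch (not part of the original module): the returned object
# is file-like, so pandas can read from it directly without a temporary file.
def _example_string_io():
  fp = get_string_io('a b\n1 2\n')
  return pd.read_csv(fp, sep=r'\s+')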

def error(trace, kappa=None):
  """Calculate the error of a trace of scalar data.
  Args:
    trace (list): 1D iterable of floating-point numbers
    kappa (float, optional): auto-correlation time; by default it is
      re-calculated from the trace
  Return:
    float: stderr, the error of the mean of this trace of scalars
  """
  from qharv.reel.forlib.stats import corr
  stddev = np.std(trace, ddof=1)
  if np.isclose(stddev, 0):  # easy case
    return 0.0  # no error for constant trace
  if kappa is None:  # auto-correlation time not supplied
    kappa = corr(trace)
  # the effective number of independent samples is len(trace)/kappa;
  # stderr = stddev/sqrt(len(trace)/kappa)
  neffective = np.sqrt(len(trace)/kappa)
  err = stddev/neffective
  return err
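
# Illustrative sketch (not part of the original module): with kappa supplied,
# error() reduces to stddev*sqrt(kappa/len(trace)); for uncorrelated samples
# (kappa=1) this is the usual standard error of the mean.
def _example_error():
  rng = np.random.default_rng(0)
  trace = rng.normal(loc=-1.0, scale=0.1, size=1000)
  return error(trace, kappa=1.0)  # roughly 0.1/sqrt(1000)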

def single_column(df, column, nequil):
  """Calculate the mean and error of a column.
  The first nequil blocks of data are thrown out; the auto-correlation time
  is taken into account when calculating the error. The equilibrated data
  are assumed to be Gaussian distributed. The error is quoted at one
  standard deviation (1-sigma error).
  Args:
    df (pd.DataFrame): table of data (e.g. from scalar_dat.parse)
    column (str): name of column
    nequil (int): number of equilibration blocks
  Return:
    (float, float, float): (ymean, yerr, ycorr), where ymean is the mean of
      the column, yerr is its 1-sigma error, and ycorr is the
      auto-correlation time
  """
  from qharv.reel.forlib.stats import corr
  myy = df[column].values[nequil:]
  ymean = np.mean(myy)
  ycorr = corr(myy)
  yerr = error(myy, ycorr)
  return ymean, yerr, ycorr
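
# Illustrative sketch (not part of the original module): a typical analysis of
# one scalar column; the file name and nequil value are hypothetical.
def _example_single_column():
  df = read('vmc.s001.scalar.dat')
  ymean, yerr, ycorr = single_column(df, 'LocalEnergy', nequil=10)
  return ymean, yerr, ycorr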