/************************************************************

 stat2Bin.cxx is a C++ program which counts a list of input
 values that have been sampled along one coordinate (by any other
 program) into bins of equal size. The input is an ASCII file
 with two types of lines
 i) lines starting with a hash (#) which are ignored
 ii) lines starting with any white space followed by a floating
    point number (like 1.23e5 or 3332 or -12e-5) optionally
    followed by more white space and other bytes which are ignored.
 These first numbers in the
 non-comment lines are the sample that is to be binned.

 The standard output contains some comments (i.e., lines that
 start with a hash so they are ignored by parsers like gnuplot)
 which contain the mean and standard deviation over the input
 samples (computed independent of the binning parameters). The
 output contains also lines with three columns which contain 
 from the left to the right
 i) the center of the bin in the units of the input data.
   If the preprocessor variable STAT2BIN_ONE_OUT_PER_BIN is
   commented before compilation, the data are duplicated in the
   sense that two lines per bin will be produced with the
   first representing the (x,y) coordinate on the left upper
   corner of the bin graph, the 2nd representing the (x,y)
   coordinate of the right upper corner of the graph.
 ii) the positive integer number of samples falling into that bin
 iii) the cumulative sum over the data falling into this
   or any prior bin. This is useful to quickly get
   percentiles if the last number in the last row is subdivided
   and the result backwards interpolated from the last/third row
   into the first.
 The bins are listed continously, which means bins without
 a hit in the input data are explicitly listed with a zero in the
 second column.

 Complaints on use etc are forwarded to stderr.
    

Compilation:
	make stat2Bin

	Some compilers may eject a warning about double-to-integer
	conversion. The coding is correct according to the standards
	and has not been adapted to such behavior.

Syntax:
	stat2Bin [-b binwidth] [-f from] [-t to] [-k field] inputf [inputf2 ....] > outputf

        Command line options and arguments:
        -b the width of each bin in the units of the data of the
           input file. If the option is not used, a default of 1
           is assumed.

        -f smallest input value at the "lower" edge of the first
           (leftmost) bin. If the value of this option happens to
           be larger than some of the values in the input file,
	   the first bin piles up these too.
           If the option is not used, a default of 0 is assumed.

        -t highest input value at the "upper" edge of the last
           (rightmost) bin. If the value of this option happens to
           be smaller than some of the values in the input file,
	   the last bin piles up these too.
           If the option is not used, a default of 100 is assumed.

        -k key is the field number of the input in each input line,
           where sequences of blanks are delimiters. In a left-to-right
           scan of the input lines, key-1 values are skipped before
           reading the value for the statistics
           If the option is not used, a default of 1 is assumed.

        inputf this is at least one mandatory command line argument which 
           specifies the UNIX file name of the input data. If more arguments
           follow, the concatenation of all of them is used for the statistics.

Example:
	cat wdsnew*.txt | wdsToSql.pl > wds.sql
	sqlite3
	sqlite> .read wds.sql
	sqlite> .output wdstmp
	sqlite> SELECT sepfst FROM wds WHERE magfst < 10;
	sqlite> .output wdstmp2
	sqlite> SELECT sepfst FROM wds WHERE magfst < 10 AND magsnd <10;
	sqlite> .read orb6.sql
	sqlite> .output wdstmp3
	sqlite> SELECT a FROM orb6 WHERE magfst < '10' AND au='a' AND ( Pu='y' AND P<'4' AND P>'0.5' OR Pu='d' AND P>'182' AND P<'1460');
	sqlite> SELECT 0.001*a FROM orb6 WHERE magfst < '10' AND au='m' AND ( Pu='y' AND P<'4' AND P>'0.5' OR Pu='d' AND P>'182' AND P<'1460');
	sqlite> .output wdstmp4
	sqlite> SELECT a FROM orb6 WHERE magfst < '10' AND au='a' AND ( Pu='y' AND P>='4' OR Pu='d' AND P>'1460');
	sqlite> SELECT 0.001*a FROM orb6 WHERE magfst < '10' AND au='m' AND ( Pu='y' AND P>='4' OR Pu='d' AND P>'1460');
	sqlite> .exit
	
	stat2Bin -b 2 -f 0.01 -t 120 wdstmp > wdsSepStat

samples input data from 0 to 120 as into bins of width 2.

 Richard J. Mathar, http://www.mpia-hd.mpg.de/~mathar, 2021-07-29
*************************************************************/
#include <cmath>
#include <cstring>
#include <cstdio>
#include <cstdlib>
#include <climits>
#include <unistd.h>
#include <ctime>
#include <iostream>
#include <iomanip>
#include <fstream>
#include <vector>
#include <algorithm>

/* maximum number of bins on output; increase in the extraordinary
* case that this is not fine enough 
*/
#define STAT2BIN_MAXBINNR 6000

/* generally, try to define steps such that
*  gnuplot> plot "outputf" wi li
*  will plot a nice histogram. The definition of the following macro assumes that
*  only one point per bin is generated, as with
*  gnuplot> set style data histeps
*  gnuplot> plot "outputf"
*/
#define STAT2BIN_ONE_OUT_PER_BIN

using namespace std ;

void usage(char *argv0)
{
	cerr << "usage: " << argv0 << "[-b binwidth] [-f from] [-t to] infile > outfile" << endl ;
}

/** compute quantile statistics over the values of the vector
* @param[in,out] dat the vector of the input values. On output, the
*     vector is sorted along increasing values.
* @param[in] qu is a value between 0 and 1 corresponding to the
*      quantile to be computed. For the median use 0.5, for the two sigma
*      range (95.4 percent) use 0.977249868 or 0.022750132, for the three
*      sigma range (99.7 percent) use 0.9986501 or 0.0013499.
* @return the abscissa value on the cumulative sum. On the curve of
*      the cumulative sum of the values in \c dat, 100*qu percent
*      of the values are smaller than the value returned.
* @since 2007-02-12
* @author Richard J. Mathar
*/
double quanti(vector<double> &dat, const double qu)
{
	/** we first sort the values with a call to an STL sort()
	* such that a quick one-step interpolation between indices and values
	* returns the value.
	*/
#ifdef TEST
	cout << "# unsorted " << endl ;
	for(int i=0 ; i < dat.size() ; i++)
		cout << i << " " << dat[i] << endl ;
#endif
	sort(dat.begin(),dat.end()) ;
#ifdef TEST
	cout << "# sorted " << endl ;
	for(int i=0 ; i < dat.size() ; i++)
		cout << i << " " << dat[i] << endl ;
#endif
	const int N=dat.size() ;
	if( qu <= 0.)
		return 2.*dat[0]-dat[1] ;
	else if ( qu >= 1.)
		return 2.*dat[N-1]-dat[N-2] ;
	else 
	{
		/* This \c indx says that \c fndx of the values (out of a total of \c N)
		* are to be left of the abscissa value (to be returned), the other right to it.
		* The subtraction of 1 indicates the conversion to the 0-based C/C++ indexing.
		* Example: if qu*N=4.2, the four values with C-indices 0,1,2 and 3 are to be
		* smaller than the value returned.
		*/
		double fndx = qu*N-1 ;
		int indx = fndx ;
		/* The return value is obtained by linear interpolation between
		* ( indx,dat[indx]) and (indx+1,dat[indx+1]).
		*/
		return dat[indx]+fmod(indx,1.)*(dat[indx+1]-dat[indx]) ;
	}
}

/** compute quantile statistics over the values of the vector
* @param[in,out] dat the vector of the input values
* @param[out] qu on return, qu[0] contains the median (50 percent quantile),
*      qu[0] contains the 15.86 percent quantile (which is the 1 sigma value
*      of a normal distribution, qu[1] contains the 84.13 percent quantile (which
*      is the 1 sigma value of a normal distribution.
* @since 2007-02-12
* @author Richard J. Mathar
*/
void quanti(vector<double> &dat, double qu[3])
{
	qu[0]=quanti(dat,0.5) ;
	qu[1]=quanti(dat,0.158655254) ;
	qu[2]=quanti(dat,0.841344746) ;
}

/** Main executable of stat2Bin.
* @param[in] argc the number of UNIX command line arguments, including
*    the name of the executable. Correct use means this parameter is between
*    2 (only the infile is given) up to 8 (all three options are used).
* @param[in] argv pointers to the starts of the command line arguments.
* @return 0 if successful, 1 if not.
*    The return value of 1 indicates either
*    - the mandatory command line argument with the file name is missing
*    - the file name does not refer to a readable file
* @since 2005-11-07
* @since 2020-07-29 remove dependency on input line lengths
* @since 2020-07-29 Accumulate more than one input file.
* @since 2020-07-29 Support -k option to skip columns
* @author Richard J. Mathar
*/
int main(int argc, char *argv[])
{
	float valBw = 1. ;	// default is binning in slices of 1. "data units"
	int key=1 ;

	/* sampling range. vrange[0] the minimum value, default zero,
	* and vrange[1] the maximum value, default 100.
	*/
	double vrange[2] ={0., 100.} ;
	if( argc < 2)
	{
		usage(argv[0]) ;
		return 1 ;
	}

	/* The flag of any command line option */
	char oc ;

	while (  (oc=getopt(argc,argv,"b:f:t:k:")) != -1 )
	{
		switch(oc)
		{
		case 'f' :
			vrange[0] = atof(optarg) ;
			break ;
		case 'b' :
			valBw = atof(optarg) ;
			break ;
		case 't' :
			vrange[1] = atof(optarg) ;
			break ;
		case 'k' :
			key = atoi(optarg) ;
			break ;
		case '?' :
			cerr << "Invalid command line option " << optopt << endl ;
			break ;
		}
	}

	/* \c counts is the array of bin occupancies, initially zero
	* (the initialization is not needed with C++ compilers that comply...)
	*/
	int counts[STAT2BIN_MAXBINNR] ;
	for(int i=0 ; i < STAT2BIN_MAXBINNR; i++)
		counts[i] =0 ;

	/* countsm[0] keeps track of the minimum bin index that has
	* at least occupancy 1. countsm[1] keeps track of the maximum index that
	* has at least occupancy 1. The total number of input values is kept
	* in countsm[2].
	*/
	int countsm[3] ;
	countsm[0]= STAT2BIN_MAXBINNR ; countsm[1]=countsm[2]=0 ;

	/* for statistical analysis: sum of the values and sum of the values squared
	* No values have been read yet and these start at zero.
	*/
	double moms[2] = {0., 0.} ;

	/** For some kind of higher statistics, we accumulate the entire input
	* vector in \c dat
	*/
	vector<double> dat ;

	/* loop over the input files
	*/
	for(int i = optind ; i < argc ; i++)
	{
		ifstream f;
		f.open(argv[i]) ;
		if ( ! f)
		{
			cerr << "Coudn't open " << argv[i] << endl ;
			continue ;
		}

		for(;;)
		{
			string buf ;
			/* read the next line from the input file
			*/
			getline(f,buf) ;
	
			/* skip it if it starts with a hash 
			*/
			if ( strncmp(buf.c_str(),"#",1) && buf.length()>0 ) 	// a line  that does not start with the hash
			{
#ifdef TEST
				cerr << "reading " << buf << endl ;
#endif
				/* read the value into \c val . Skip some if key>1.
				*/
				string skippedval ;
				istringstream s(buf) ;
				for(int sk=1 ; sk < key ; sk++)
					s >> skippedval ;

				double val ;
				s >> val ;
#ifdef TEST
				cerr << "pushing " << val << endl ;
#endif
				dat.push_back(val) ;
	
				/* compute a bin number for this value by linear interpolation
				* between the leftmost and rightmost edges of the first and last
				* bin. If the value falls outside these ranges, put it into either
				* the first or the last bin.
				*/
				int binnr = (val-vrange[0])/valBw ;
				if ( binnr >= STAT2BIN_MAXBINNR)
					binnr = STAT2BIN_MAXBINNR-1 ;
				if ( binnr < vrange[0])
					binnr = STAT2BIN_MAXBINNR-2 ;
				counts[binnr]++ ;
				countsm[2]++ ;
	
				/* update the sum over all input values and the sum over these squared
				*/
				moms[0] += val ;
				moms[1] += val*val ;
#ifdef TEST
				cerr << "plus " << binnr << " " << vrange[0] << " " << vrange[1] << endl ;
#endif
				countsm[1] = max(countsm[1],binnr) ;
				countsm[0] = min(countsm[0],binnr) ;
			}
			if ( f.eof() )
				break ;
		}
		f.close() ;
	}

	double qu[3] ;
	

	time_t now = time(0) ;
	cout << "# generated with " << argv[0] << " from " << argv[optind] << " " << ctime(&now) ;
	cout << "# " << countsm[2] << " counts between bins " << countsm[0] << " and " << countsm[1] << endl ;
#ifdef TEST
	cerr << "sum " << moms[0] << " " << moms[1] << endl ;
#endif

	/* mean is sum divided by the number of values.
	*  Std Dev is sqrt[sum(v-vbar)^2/(n-1)] = sqrt[(sum v^2 -2vbar*sum v+vbar^2*n)/(n-1)].
	*  = sqrt[(sum v^2 -2vbar*n*var+vbar^2*n)/(n-1)] = sqrt[(sum v^2 -vbar^2*n)/(n-1)]
	*/
	moms[0] = moms[0]/countsm[2] ;
	moms[1] = sqrt((moms[1]-countsm[2]*moms[0]*moms[0])/(countsm[2]-1)) ;
	cout << "# mean " <<  moms[0] << " std dev " << moms[1] << endl ;

	quanti(dat,qu) ;
	cout << "# median " <<  qu[0] << " + " << qu[2]-qu[0] << " - " << qu[0]-qu[1] << endl ;

	// output of the binning results
	int sumCounts=0 ;
	for(int i=countsm[0]; i <= countsm[1] ; i++)
	{
#ifdef STAT2BIN_ONE_OUT_PER_BIN
		sumCounts += counts[i] ;
		cout << vrange[0]+(i+0.5)*valBw << " " << counts[i] << " " << sumCounts << endl ;
#else
		cout << vrange[0]+i*valBw << " " << counts[i] << " " << sumCounts << endl ;
		sumCounts += counts[i] ;
		cout << vrange[0]+(i+1)*valBw << " " << counts[i] << " " << sumCounts << endl ;
#endif
	}

	return 0 ;
}

#undef STAT2BIN_ONE_OUT_PER_BIN
#undef STAT2BIN_MAXBINNR