/************************************************************ stat2Bin.cxx is a C++ program which counts a list of input values that have been sampled along one coordinate (by any other program) into bins of equal size. The input is an ASCII file with two types of lines i) lines starting with a hash (#) which are ignored ii) lines starting with any white space followed by a floating point number (like 1.23e5 or 3332 or -12e-5) optionally followed by more white space and other bytes which are ignored. These first numbers in the non-comment lines are the sample that is to be binned. The standard output contains some comments (i.e., lines that start with a hash so they are ignored by parsers like gnuplot) which contain the mean and standard deviation over the input samples (computed independent of the binning parameters). The output contains also lines with three columns which contain from the left to the right i) the center of the bin in the units of the input data. If the preprocessor variable STAT2BIN_ONE_OUT_PER_BIN is commented before compilation, the data are duplicated in the sense that two lines per bin will be produced with the first representing the (x,y) coordinate on the left upper corner of the bin graph, the 2nd representing the (x,y) coordinate of the right upper corner of the graph. ii) the positive integer number of samples falling into that bin iii) the cumulative sum over the data falling into this or any prior bin. This is useful to quickly get percentiles if the last number in the last row is subdivided and the result backwards interpolated from the last/third row into the first. The bins are listed continously, which means bins without a hit in the input data are explicitly listed with a zero in the second column. Complaints on use etc are forwarded to stderr. Compilation: make stat2Bin Some compilers may eject a warning about double-to-integer conversion. The coding is correct according to the standards and has not been adapted to such behavior. Syntax: stat2Bin [-b binwidth] [-f from] [-t to] [-k field] inputf [inputf2 ....] > outputf Command line options and arguments: -b the width of each bin in the units of the data of the input file. If the option is not used, a default of 1 is assumed. -f smallest input value at the "lower" edge of the first (leftmost) bin. If the value of this option happens to be larger than some of the values in the input file, the first bin piles up these too. If the option is not used, a default of 0 is assumed. -t highest input value at the "upper" edge of the last (rightmost) bin. If the value of this option happens to be smaller than some of the values in the input file, the last bin piles up these too. If the option is not used, a default of 100 is assumed. -k key is the field number of the input in each input line, where sequences of blanks are delimiters. In a left-to-right scan of the input lines, key-1 values are skipped before reading the value for the statistics If the option is not used, a default of 1 is assumed. inputf this is at least one mandatory command line argument which specifies the UNIX file name of the input data. If more arguments follow, the concatenation of all of them is used for the statistics. Example: cat wdsnew*.txt | wdsToSql.pl > wds.sql sqlite3 sqlite> .read wds.sql sqlite> .output wdstmp sqlite> SELECT sepfst FROM wds WHERE magfst < 10; sqlite> .output wdstmp2 sqlite> SELECT sepfst FROM wds WHERE magfst < 10 AND magsnd <10; sqlite> .read orb6.sql sqlite> .output wdstmp3 sqlite> SELECT a FROM orb6 WHERE magfst < '10' AND au='a' AND ( Pu='y' AND P<'4' AND P>'0.5' OR Pu='d' AND P>'182' AND P<'1460'); sqlite> SELECT 0.001*a FROM orb6 WHERE magfst < '10' AND au='m' AND ( Pu='y' AND P<'4' AND P>'0.5' OR Pu='d' AND P>'182' AND P<'1460'); sqlite> .output wdstmp4 sqlite> SELECT a FROM orb6 WHERE magfst < '10' AND au='a' AND ( Pu='y' AND P>='4' OR Pu='d' AND P>'1460'); sqlite> SELECT 0.001*a FROM orb6 WHERE magfst < '10' AND au='m' AND ( Pu='y' AND P>='4' OR Pu='d' AND P>'1460'); sqlite> .exit stat2Bin -b 2 -f 0.01 -t 120 wdstmp > wdsSepStat samples input data from 0 to 120 as into bins of width 2. Richard J. Mathar, http://www.mpia-hd.mpg.de/~mathar, 2021-07-29 *************************************************************/ #include #include #include #include #include #include #include #include #include #include #include #include /* maximum number of bins on output; increase in the extraordinary * case that this is not fine enough */ #define STAT2BIN_MAXBINNR 6000 /* generally, try to define steps such that * gnuplot> plot "outputf" wi li * will plot a nice histogram. The definition of the following macro assumes that * only one point per bin is generated, as with * gnuplot> set style data histeps * gnuplot> plot "outputf" */ #define STAT2BIN_ONE_OUT_PER_BIN using namespace std ; void usage(char *argv0) { cerr << "usage: " << argv0 << "[-b binwidth] [-f from] [-t to] infile > outfile" << endl ; } /** compute quantile statistics over the values of the vector * @param[in,out] dat the vector of the input values. On output, the * vector is sorted along increasing values. * @param[in] qu is a value between 0 and 1 corresponding to the * quantile to be computed. For the median use 0.5, for the two sigma * range (95.4 percent) use 0.977249868 or 0.022750132, for the three * sigma range (99.7 percent) use 0.9986501 or 0.0013499. * @return the abscissa value on the cumulative sum. On the curve of * the cumulative sum of the values in \c dat, 100*qu percent * of the values are smaller than the value returned. * @since 2007-02-12 * @author Richard J. Mathar */ double quanti(vector &dat, const double qu) { /** we first sort the values with a call to an STL sort() * such that a quick one-step interpolation between indices and values * returns the value. */ #ifdef TEST cout << "# unsorted " << endl ; for(int i=0 ; i < dat.size() ; i++) cout << i << " " << dat[i] << endl ; #endif sort(dat.begin(),dat.end()) ; #ifdef TEST cout << "# sorted " << endl ; for(int i=0 ; i < dat.size() ; i++) cout << i << " " << dat[i] << endl ; #endif const int N=dat.size() ; if( qu <= 0.) return 2.*dat[0]-dat[1] ; else if ( qu >= 1.) return 2.*dat[N-1]-dat[N-2] ; else { /* This \c indx says that \c fndx of the values (out of a total of \c N) * are to be left of the abscissa value (to be returned), the other right to it. * The subtraction of 1 indicates the conversion to the 0-based C/C++ indexing. * Example: if qu*N=4.2, the four values with C-indices 0,1,2 and 3 are to be * smaller than the value returned. */ double fndx = qu*N-1 ; int indx = fndx ; /* The return value is obtained by linear interpolation between * ( indx,dat[indx]) and (indx+1,dat[indx+1]). */ return dat[indx]+fmod(indx,1.)*(dat[indx+1]-dat[indx]) ; } } /** compute quantile statistics over the values of the vector * @param[in,out] dat the vector of the input values * @param[out] qu on return, qu[0] contains the median (50 percent quantile), * qu[0] contains the 15.86 percent quantile (which is the 1 sigma value * of a normal distribution, qu[1] contains the 84.13 percent quantile (which * is the 1 sigma value of a normal distribution. * @since 2007-02-12 * @author Richard J. Mathar */ void quanti(vector &dat, double qu[3]) { qu[0]=quanti(dat,0.5) ; qu[1]=quanti(dat,0.158655254) ; qu[2]=quanti(dat,0.841344746) ; } /** Main executable of stat2Bin. * @param[in] argc the number of UNIX command line arguments, including * the name of the executable. Correct use means this parameter is between * 2 (only the infile is given) up to 8 (all three options are used). * @param[in] argv pointers to the starts of the command line arguments. * @return 0 if successful, 1 if not. * The return value of 1 indicates either * - the mandatory command line argument with the file name is missing * - the file name does not refer to a readable file * @since 2005-11-07 * @since 2020-07-29 remove dependency on input line lengths * @since 2020-07-29 Accumulate more than one input file. * @since 2020-07-29 Support -k option to skip columns * @author Richard J. Mathar */ int main(int argc, char *argv[]) { float valBw = 1. ; // default is binning in slices of 1. "data units" int key=1 ; /* sampling range. vrange[0] the minimum value, default zero, * and vrange[1] the maximum value, default 100. */ double vrange[2] ={0., 100.} ; if( argc < 2) { usage(argv[0]) ; return 1 ; } /* The flag of any command line option */ char oc ; while ( (oc=getopt(argc,argv,"b:f:t:k:")) != -1 ) { switch(oc) { case 'f' : vrange[0] = atof(optarg) ; break ; case 'b' : valBw = atof(optarg) ; break ; case 't' : vrange[1] = atof(optarg) ; break ; case 'k' : key = atoi(optarg) ; break ; case '?' : cerr << "Invalid command line option " << optopt << endl ; break ; } } /* \c counts is the array of bin occupancies, initially zero * (the initialization is not needed with C++ compilers that comply...) */ int counts[STAT2BIN_MAXBINNR] ; for(int i=0 ; i < STAT2BIN_MAXBINNR; i++) counts[i] =0 ; /* countsm[0] keeps track of the minimum bin index that has * at least occupancy 1. countsm[1] keeps track of the maximum index that * has at least occupancy 1. The total number of input values is kept * in countsm[2]. */ int countsm[3] ; countsm[0]= STAT2BIN_MAXBINNR ; countsm[1]=countsm[2]=0 ; /* for statistical analysis: sum of the values and sum of the values squared * No values have been read yet and these start at zero. */ double moms[2] = {0., 0.} ; /** For some kind of higher statistics, we accumulate the entire input * vector in \c dat */ vector dat ; /* loop over the input files */ for(int i = optind ; i < argc ; i++) { ifstream f; f.open(argv[i]) ; if ( ! f) { cerr << "Coudn't open " << argv[i] << endl ; continue ; } for(;;) { string buf ; /* read the next line from the input file */ getline(f,buf) ; /* skip it if it starts with a hash */ if ( strncmp(buf.c_str(),"#",1) && buf.length()>0 ) // a line that does not start with the hash { #ifdef TEST cerr << "reading " << buf << endl ; #endif /* read the value into \c val . Skip some if key>1. */ string skippedval ; istringstream s(buf) ; for(int sk=1 ; sk < key ; sk++) s >> skippedval ; double val ; s >> val ; #ifdef TEST cerr << "pushing " << val << endl ; #endif dat.push_back(val) ; /* compute a bin number for this value by linear interpolation * between the leftmost and rightmost edges of the first and last * bin. If the value falls outside these ranges, put it into either * the first or the last bin. */ int binnr = (val-vrange[0])/valBw ; if ( binnr >= STAT2BIN_MAXBINNR) binnr = STAT2BIN_MAXBINNR-1 ; if ( binnr < vrange[0]) binnr = STAT2BIN_MAXBINNR-2 ; counts[binnr]++ ; countsm[2]++ ; /* update the sum over all input values and the sum over these squared */ moms[0] += val ; moms[1] += val*val ; #ifdef TEST cerr << "plus " << binnr << " " << vrange[0] << " " << vrange[1] << endl ; #endif countsm[1] = max(countsm[1],binnr) ; countsm[0] = min(countsm[0],binnr) ; } if ( f.eof() ) break ; } f.close() ; } double qu[3] ; time_t now = time(0) ; cout << "# generated with " << argv[0] << " from " << argv[optind] << " " << ctime(&now) ; cout << "# " << countsm[2] << " counts between bins " << countsm[0] << " and " << countsm[1] << endl ; #ifdef TEST cerr << "sum " << moms[0] << " " << moms[1] << endl ; #endif /* mean is sum divided by the number of values. * Std Dev is sqrt[sum(v-vbar)^2/(n-1)] = sqrt[(sum v^2 -2vbar*sum v+vbar^2*n)/(n-1)]. * = sqrt[(sum v^2 -2vbar*n*var+vbar^2*n)/(n-1)] = sqrt[(sum v^2 -vbar^2*n)/(n-1)] */ moms[0] = moms[0]/countsm[2] ; moms[1] = sqrt((moms[1]-countsm[2]*moms[0]*moms[0])/(countsm[2]-1)) ; cout << "# mean " << moms[0] << " std dev " << moms[1] << endl ; quanti(dat,qu) ; cout << "# median " << qu[0] << " + " << qu[2]-qu[0] << " - " << qu[0]-qu[1] << endl ; // output of the binning results int sumCounts=0 ; for(int i=countsm[0]; i <= countsm[1] ; i++) { #ifdef STAT2BIN_ONE_OUT_PER_BIN sumCounts += counts[i] ; cout << vrange[0]+(i+0.5)*valBw << " " << counts[i] << " " << sumCounts << endl ; #else cout << vrange[0]+i*valBw << " " << counts[i] << " " << sumCounts << endl ; sumCounts += counts[i] ; cout << vrange[0]+(i+1)*valBw << " " << counts[i] << " " << sumCounts << endl ; #endif } return 0 ; } #undef STAT2BIN_ONE_OUT_PER_BIN #undef STAT2BIN_MAXBINNR