From: Francois Fleuret Date: Mon, 30 Mar 2009 09:11:57 +0000 (+0200) Subject: Initial commit. X-Git-Url: https://ant.fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=3201af50a66f14e327f41549c183871e837556c8;p=data-tool.git Initial commit. --- 3201af50a66f14e327f41549c183871e837556c8 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..cdd5ce6 --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ + +# +# data-tool is a command line tool to do simple statistical +# processing on numerical data. +# +# Copyright (c) 2009 Francois Fleuret +# Written by Francois Fleuret +# +# This file is part of data-tool. +# +# data-tool is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 3 as +# published by the Free Software Foundation. +# +# data-tool is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with data-tool. If not, see . +# + +ifeq ($(DEBUG),yes) + OPTIMIZE_FLAG = -ggdb3 -DDEBUG +else + OPTIMIZE_FLAG = -ggdb3 -O3 +endif + +ifeq ($(PROFILE),yes) + PROFILE_FLAG = -pg +endif + +CXXFLAGS = -Wall $(OPTIMIZE_FLAG) $(PROFILE_FLAG) + +all: data-tool + +TAGS: *.cc *.h + etags --members -l c++ *.cc *.h + +data-tool: data-tool.o + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) + +Makefile.depend: *.h *.cc Makefile + $(CC) -M *.cc > Makefile.depend + +clean: + \rm -f data-tool *.o Makefile.depend TAGS + +-include Makefile.depend diff --git a/data-tool.cc b/data-tool.cc new file mode 100644 index 0000000..bcf19b3 --- /dev/null +++ b/data-tool.cc @@ -0,0 +1,449 @@ + +/* + * data-tool is a command line tool to do simple statistical + * processing on numerical data. + * + * Copyright (c) 2009 Francois Fleuret + * Written by Francois Fleuret + * + * This file is part of data-tool. + * + * data-tool is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 3 as + * published by the Free Software Foundation. + * + * data-tool is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with data-tool. If not, see . + * + */ + +#include +#include +#include +#include + +using namespace std; + +struct Couple { + int index; + double value; +}; + +int compare_couple(const void *a, const void *b) { + if(((Couple *) a)->value < ((Couple *) b)->value) return -1; + else if(((Couple *) a)->value > ((Couple *) b)->value) return 1; + else return 0; +} + +double *inflate_array(double *x, int current_size, int new_size) { + double *xx = new double[new_size]; + for(int n = 0; n < current_size; n++) xx[n] = x[n]; + delete[] x; + return xx; +} + +char *next_word(char *buffer, char *r, int buffer_size) { + char *s; + s = buffer; + if(r != NULL) + { + if(*r == '"') { + r++; + while((*r != '"') && (*r != '\0') && + (s= argc) { + cerr << "ERROR: Missing argument for " << argv[n_opt] << ". Expecting " << help << "." << endl; + exit(1); + } +} + +void print_help_and_exit(int e) { + cout << "Simple data processing tool. Written by Francois Fleuret." << endl + << endl + << "This application takes data from the standard input and prints" << endl + << "the result on the standard output. It expects either a list of" << endl + << "float values (to produce histograms, cumulative distribution functions" << endl + << "or the mean, variance, etc.) or a list of couples of values of the form" << endl + << "x y on each line (where the sign of x tells the class and y the parameter" << endl + << "value) to compute the ROC curve or the ROC curve surface.\n" << endl + << "The options are:" << endl + << " --help" << endl + << " --roc" << endl + << " --roc-surface" << endl + << " --normalize" << endl + << " --histo" << endl + << " --cumul" << endl + << " --misc" << endl + << " --auto-extrema" << endl + << " --xbounds " << endl + << " --ybounds " << endl + << " --nb-bins " << endl; + exit(e); +} + +void check_single_processing(bool unknown_processing) { + if(!unknown_processing) { + cerr << "ERROR: You can't do two different processings." << endl; + exit(1); + } +} + +int main(int argc, char **argv) { + double xmin = 0, xmax = 1, ymin = 0, ymax = 1; + int nb_bins = 10; + const int buffer_size = 1024; + + char line[buffer_size], token[buffer_size]; + bool auto_extrema = false; + bool normalize = false; + + int i = 1; + + enum { UNKNOWN, ROC, ROC_SURFACE, HISTO, CUMUL, MISC } processing = UNKNOWN; + + // Parsing the command line arguments //////////////////////////////// + + while(i < argc) { + + if(argc == 1 || strcmp(argv[i], "--help") == 0) print_help_and_exit(0); + + else if(strcmp(argv[i], "--roc") == 0) { + check_single_processing(processing == UNKNOWN); + processing = ROC; + i++; + } + + else if(strcmp(argv[i], "--roc-surface") == 0) { + check_single_processing(processing == UNKNOWN); + processing = ROC_SURFACE; + i++; + } + + else if(strcmp(argv[i], "--cumul") == 0) { + check_single_processing(processing == UNKNOWN); + processing = CUMUL; + i++; + } + + else if(strcmp(argv[i], "--normalize") == 0) { + normalize = true; + i++; + } + + else if(strcmp(argv[i], "--histo") == 0) { + check_single_processing(processing == UNKNOWN); + processing = HISTO; + i++; + } + + else if(strcmp(argv[i], "--misc") == 0) { + check_single_processing(processing == UNKNOWN); + processing = MISC; + i++; + } + + else if(strcmp(argv[i], "--auto-extrema") == 0) { + auto_extrema = true; + i++; + } + + else if(strcmp(argv[i], "--xbounds") == 0) { + check_opt(argc, argv, i, 2, " "); + xmin = atof(argv[i+1]); + xmax = atof(argv[i+2]); + if(xmin >= xmax) { + cerr << "ERROR: Incorrect bounds." << endl; + exit(1); + } + i += 3; + } + + else if(strcmp(argv[i], "--ybounds") == 0) { + check_opt(argc, argv, i, 2, " "); + ymin = atof(argv[i+1]); + ymax = atof(argv[i+2]); + if(ymin >= ymax) { + cerr << "ERROR: Incorrect bounds." << endl; + exit(1); + } + i += 3; + } + + else if(strcmp(argv[i], "--nb-bins") == 0) { + check_opt(argc, argv, i, 1, ""); + nb_bins = atoi(argv[i+1]); + if(nb_bins < 1) { + cerr << "ERROR: Incorrect number of bins." << endl; + exit(1); + } + i += 2; + } + + else { + cerr << "ERROR: Unknown option " << argv[i] << endl; + print_help_and_exit(1); + } + } + + // Processing the data /////////////////////////////////////////////// + + switch(processing) { + + case CUMUL: + + { + int nb_samples = 0, nb_samples_max = 50000; + double *x = new double[nb_samples_max]; + + while(!cin.eof()) { + if(nb_samples == nb_samples_max) { + x = inflate_array(x, nb_samples_max, 2 * nb_samples_max); + nb_samples_max = 2 * nb_samples_max; + } + + cin.getline(line, buffer_size); + + if(line[0]) { + char *s = line; + s = next_word(token, s, buffer_size); + x[nb_samples] = atof(token); + nb_samples++; + } + } + + Couple tmp[nb_samples]; + for(int n = 0; n < nb_samples; n++) { + tmp[n].index = n; + tmp[n].value = x[n]; + } + + qsort(tmp, nb_samples, sizeof(Couple), compare_couple); + + for(int n = 0; n < nb_samples; n++) + cout << tmp[n].value << " " << double(n)/double(nb_samples) << endl; + + delete[] x; + + } + + break; + + case ROC: + case ROC_SURFACE: + + { + int nb_samples = 0, nb_samples_max = 1000; + double *x = new double[nb_samples_max], *y = new double[nb_samples_max]; + + while(!cin.eof()) { + if(nb_samples == nb_samples_max) { + x = inflate_array(x, nb_samples_max, 2 * nb_samples_max); + y = inflate_array(y, nb_samples_max, 2 * nb_samples_max); + nb_samples_max = 2 * nb_samples_max; + } + + cin.getline(line, buffer_size); + + if(line[0]) { + char *s = line; + s = next_word(token, s, buffer_size); + x[nb_samples] = atof(token); + s = next_word(token, s, buffer_size); + y[nb_samples] = atof(token); + nb_samples++; + } + } + + Couple tmp[nb_samples]; + int nb_rn = 0, nb_rp = 0, nb_fp = 0, nb_fn = 0; + + bool binary = true; + for(int n = 0; binary && n < nb_samples; n++) binary &= (x[n] == 0 || x[n] == 1); + if(binary) { + cerr << "WARNING: your classes are binary, I process them accordingly." << endl; + for(int n = 0; n < nb_samples; n++) x[n] = 2 * x[n] - 1; + } + + for(int n = 0; n < nb_samples; n++) { + tmp[n].index = n; + tmp[n].value = y[n]; + if(x[n] >= 0) nb_rp++; + else { nb_rn++; nb_fp++; } + } + + if(nb_rp == 0) cerr << "WARNING: No true positive." << endl; + if(nb_rn == 0) cerr << "WARNING: No true negative." << endl; + + qsort(tmp, nb_samples, sizeof(Couple), compare_couple); + + if(processing == ROC) { + for(int n = 0; n < nb_samples - 1; n++) { + if(x[tmp[n].index] >= 0) nb_fn++; + else nb_fp--; + if(tmp[n].value < tmp[n+1].value) { + cout << double(nb_fp)/double(nb_rn) << " " + << 1 - double(nb_fn) / double(nb_rp) << " " + << (tmp[n].value + tmp[n+1].value) << " " + << endl; + } + } + } else { + double surface = 0; + double cx = double(nb_fp)/double(nb_rn), cy = 1 - double(nb_fn) / double(nb_rp); + for(int n = 0; n < nb_samples - 1; n++) { + if(x[tmp[n].index] >= 0) nb_fn++; + else nb_fp--; + if(tmp[n].value < tmp[n+1].value) { + double ncx = double(nb_fp)/double(nb_rn), ncy = 1 - double(nb_fn) / double(nb_rp); + surface += (cx - ncx) * cy; + cx = ncx; cy = ncy; + } + } + cout << surface << endl; + } + + delete[] x; delete[] y; + + } + + break; + + case HISTO: + + { + int nb_samples = 0, nb_samples_max = 1000; + double *x = new double[nb_samples_max]; + + while(!cin.eof()) { + if(nb_samples == nb_samples_max) { + x = inflate_array(x, nb_samples_max, 2 * nb_samples_max); + nb_samples_max = 2 * nb_samples_max; + } + + cin.getline(line, buffer_size); + + if(line[0]) { + char *s = line; + s = next_word(token, s, buffer_size); + x[nb_samples] = atof(token); + if(auto_extrema) { + if(nb_samples == 0 || x[nb_samples] > xmax) xmax = x[nb_samples]; + if(nb_samples == 0 || x[nb_samples] < xmin) xmin = x[nb_samples]; + } + nb_samples++; + } + } + + int nb[nb_bins]; + for(int n = 0; n < nb_bins; n++) nb[n] = 0; + + int nb_total = 0; + for(int s = 0; s < nb_samples; s++) { + int n = int((x[s] - xmin)/(xmax - xmin) * nb_bins); + if(n >= 0 && n < nb_bins) nb[n]++; + else { + cerr << "WARNING: value " << x[s] << " is out of histogram." << endl; + } + nb_total++; + } + + if(normalize) { + for(int n = 0; n < nb_bins; n++) + cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " " + << (nb[n] / double(nb_total))/((xmax - xmin) / double(nb_bins)) << endl; + } else { + for(int n = 0; n < nb_bins; n++) + cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " " + << nb[n] / double(nb_total) << endl; + } + } + + break; + + case MISC: + + { + int nb_samples = 0, nb_samples_max = 1000; + double *x = new double[nb_samples_max]; + int nb = 0; + double min = 0, max = 0; + double sum = 0, sumsq = 0; + + while(!cin.eof()) { + if(nb_samples == nb_samples_max) { + x = inflate_array(x, nb_samples_max, 2 * nb_samples_max); + nb_samples_max = 2 * nb_samples_max; + } + + cin.getline(line, buffer_size); + char *s = line; + if(line[0]) { + s = next_word(token, s, buffer_size); + x[nb_samples] = atof(token); + nb_samples++; + double x = atof(token); + if(nb == 0 || x > max) max = x; + if(nb == 0 || x < min) min = x; + sum += x; + sumsq += x*x; + nb++; + } + } + + Couple tmp[nb_samples]; + for(int n = 0; n < nb_samples; n++) { + tmp[n].index = n; + tmp[n].value = x[n]; + } + + qsort(tmp, nb_samples, sizeof(Couple), compare_couple); + + delete[] x; + + double mu = sum / double(nb); + double sigma = (sumsq - sum * mu) / double(nb - 1); + double stdd = sqrt(sigma); + + cout << "MIN " << min + << " MAX " << max + << " MU " << mu + << " SIGMA " << sigma + << " STDD " << stdd + << " SUM " << sum + << " MEDIAN " << tmp[nb_samples/2].value + << " QUANTILE0.1 " << tmp[int(nb_samples * 0.1)].value + << " QUANTILE0.9 " << tmp[int(nb_samples * 0.9)].value + << endl; + + } + + break; + + default: + cerr << "ERROR: You must choose a processing type." << endl; + exit(1); + } + +}