/*
 * This program implements the Evolutionary Fuzzy Clustering algorithm as
 * published by B.P.P. van Houte and J. Heringa in Bioinformatics, 26(1): 6-14.
 *
 * Please refer to the manual with this program for its usage.
 */


#include <iostream>
#include <fstream>
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include <ctime>
#include <cmath>

using namespace std;


#define QUICK_SORT_STACK  50

#define TINY              1e-20
#define SMALL             0.0001

#define MAX_SZ_LABEL      100   // maximum number of chars in a data label
#define MAX_NR_CASES      25000

#define POPULATION_SZ     15
#define OFFSPRING_SZ      100  // MUST be even, due to recombination
#define MAX_GENERATIONS   100
#define M                 1.5

#define EUCLIDEAN_DIST    0
#define PEARSON_DIST      1
#define SPEARMAN_DIST     2
#define AGREEMENT_DIST    3
#define CONCORDANCE_DIST  4


int nrCases, nrAttributes, generation, distanceMetric, nrClustersFixed;


/*
 * Global variable to hold the data set.
 */
struct DataType {
	char  **labels;
	float **values;
} data;


/*
 * Global variable to hold the current population. Each individual
 * within the population corresponds with one clustering, represented
 * by cluster-centroids.
 */
struct Individual {
	int nrClusters;
	float **centroids;
	float fitness;
} *population;


/*
 * Prints the usage of this program to standard error
 */
void printUsage () {
	cerr << "Usage: ./EFC <distance measure> <nr clusters> <file>\n\n";
	cerr << "  Distance measures available:\n";
	cerr << "   - \"e\" (Euclidean distance)\n";
	cerr << "   - \"p\" (Pearson distance)\n";
	cerr << "   - \"s\" (Spearman distance)\n";
	cerr << "   - \"a\" (Agreement distance)\n";
	cerr << "   - \"c\" (Concordance distance)\n\n";
}


/*
 * Skip comments at the input stream `infile'.
 */
void skipComments (ifstream &infile) {
	string line;
	while (infile >> ws && infile.peek() == '#') {
		getline(infile, line);
	}
}


/*
 * Skip white space at the input stream `infile'.
 */
void skipWhiteSpace (ifstream &infile) {
	while (infile.peek() == ' ' || infile.peek() == '\t') {
		infile.get();
	}
}


/*
 * Returns true if the pointer of filestream `infile' is at an end of
 * line character, or is at the end of file. Returns false otherwise.
 * Skips white space first.
 */
bool eoln (ifstream &infile) {
	skipWhiteSpace(infile);
	char nextChar = infile.peek();
	return nextChar == '\r' || nextChar == '\n' || nextChar == EOF;
}


/*
 * Read a data label (class name) from filestream `infile'. The label
 * should be delimited by a space or a tab. The result is put into
 * a string pointed to by `label'.
 */
void readLabel (ifstream &infile, char label[]) {
	char c;
	int i = 0;
	while (infile.get(c) && c != ' ' && c!= '\t' && c != EOF) {
		label[i++] = c;
	}
	label[i] = '\0';
}


/*
 * Determine the number of attributes in the data set, pointed to
 * by the input stream `infile'. The counted number of attributes
 * is returned.
 */
int getNrAttributes (ifstream &infile) {
	int result = 0;
	float value;
	char *label = new char[MAX_SZ_LABEL];
	
	int origPos = infile.tellg();
	readLabel(infile, label);
	while (!eoln(infile)) {
		infile >> value;
		result++;
	}
	infile.seekg(origPos);
	return result;
}


/*
 * Read the data file, with name `fileName'. The data is put into
 * the structure pointed to by `data'. The number of attributes
 * within the data set is returned.
 * 
 */
int readDataFile (const char *fileName, DataType *data) {
	ifstream dataFile(fileName);
	int nrAttributes;
	
	// Can I open the file?
	if (!dataFile.is_open()) {
		cerr << "Error: Unable to open file: " << fileName << '\n';
		exit(1);
	}
	
	skipComments(dataFile);
	nrAttributes = getNrAttributes(dataFile);

	// Read the data
	data->labels = new char *[MAX_NR_CASES];
	data->values = new float *[MAX_NR_CASES];
	while (!dataFile.eof()) {
		data->labels[nrCases] = new char[MAX_SZ_LABEL];
		data->values[nrCases] = new float[nrAttributes];
		
		// Read label
		skipWhiteSpace(dataFile);
		readLabel(dataFile, data->labels[nrCases]);
		
		// Read attribute values
		for (int i = 0; i < nrAttributes; i++) {
			if (!eoln(dataFile)) {
				dataFile >> data->values[nrCases][i];
			} else {
				cerr << "Error: Too few attributes in case ";
				cerr << nrCases + 1 << "\n";
				exit(1);
			}
		}
		
		// Do some checks
		if (eoln(dataFile)) {
			dataFile >> ws; // read eoln
		} else {
			cerr << "Error: Too many attributes in case ";
			cerr << nrCases + 1 << "\n";
			exit(1);
		}
		if (++nrCases > MAX_NR_CASES) {
			cerr << "Error: Too many cases in data file\n";
			cerr << "(> " << MAX_NR_CASES << ")" << "\n";
			exit(1);
		}
	}
	return (nrAttributes);
}


/*
 * Return the square of `x'.
 */
inline float square (float x) {
	return (x * x);
}


/*
 * Remove the flat profiles from the data set, pointed to by `data'. The
 * cases with a variance less than some threshold are filtered out. These
 * cases, the so called flat profiles, are written to standard output.
 */
int removeFlatProfiles (DataType *data, const int size) {
	int localNrCases = size;
	bool noneFound = true;
	
	cout << "\n===> REMOVED FLAT PROFILES (std.dev. <= 0.05)\n\n";
	for (int i=0; i<localNrCases; i++) {
		// Compute average
		float sum = 0;
		for (int j=0; j<nrAttributes; j++) {
			sum += data->values[i][j];
		}
		float avg = sum / nrAttributes;
	
		// Compute standard deviation
		sum = 0;
		for (int j=0; j<nrAttributes; j++) {
			sum += square(data->values[i][j] - avg);
		}
		float stddev = sqrt(sum / (nrAttributes - 1));

		// Remove the flat profiles
		if (stddev <= 0.05) {
			if (noneFound) {
				cout << "              Sample\tStd.dev.\n";
				noneFound = false;
			}
			localNrCases--;
			printf("%20s\t%.4f\n", data->labels[i], stddev);
			data->values[i] = data->values[localNrCases];
			data->labels[i] = data->labels[localNrCases];
			i--;
		}
	}
	if (noneFound) { // if no flat profiles found
		cout << "\t<none>\n";
	}
	
	return (localNrCases);
}


/*
 * Returns a (pseudo) random integer between and including `min'
 * and `max'.
 */
inline int randomInt (int min, int max) {
	return (int) ((max - min) * drand48() + min + 0.5);
}


/*
 * Initialize the population pointed to by `p', containing `size'
 * individuals. Each individual within `p' is assigned a clustering
 * containing a random number of clusters. Each cluster within
 * the clusterings is initialized by assigning a random case from
 * the data set to it. 
 */
void initialize (Individual **p, int size) {
	int randomCase;
	
	// Initialise population
	*p = new Individual[size];
	for (int i=0; i<size; i++) {
		(*p)[i].nrClusters = nrClustersFixed;
		(*p)[i].centroids = new float *[nrClustersFixed];
		for (int j=0; j<nrClustersFixed; j++) {
			(*p)[i].centroids[j] = new float[nrAttributes];
			randomCase = randomInt(0, nrCases-1);
			for (int k=0; k<nrAttributes; k++) {
				(*p)[i].centroids[j][k] = data.values[randomCase][k];
			}
		}
	}
}


/*
 * Swaps the elements pointed to by `i' and `j'.
 */
inline void swap (float &i, float &j) {
	float temp = i;
	i = j;
	j = temp;
}


/*
 * Sorts the elements of vector `x' with the QuickSort algorithm. The
 * elements of vector `y' are moved at the same time as we sort `x'. If
 * the number of elements to be sorted drops below 7, insertion sort
 * is used.
 */
void quickSort2 (float x[], float y[], int size) {
	int i, ir = size-1, j, k, l = 0, *istack;
	int jstack = 0;
	float pivotX, pivotY;

	istack = new int[QUICK_SORT_STACK];
	for (;;) {
		if (ir-l < 7) {
			for (j=l+1; j<=ir; j++) {
				pivotX = x[j];
				pivotY = y[j];
				for (i=j-1; i>=l; i--) {
					if (x[i] <= pivotX) break;
					x[i+1] = x[i];
					y[i+1] = y[i];
				}
				x[i+1] = pivotX;
				y[i+1] = pivotY;
			}
			if (!jstack) {
				delete(istack);
				return;
			}
			ir = istack[jstack];
			l = istack[jstack-1];
			jstack -= 2;
		} else {
			k = (l+ir) >> 1;
			swap(x[k], x[l+1]);
			swap(y[k], y[l+1]);
			if (x[l] > x[ir]) {
				swap(x[l], x[ir]);
				swap(y[l], y[ir]);
			}
			if (x[l+1] > x[ir]) {
				swap(x[l+1], x[ir]);
				swap(y[l+1], y[ir]);
			}
			if (x[l] > x[l+1]) {
				swap(x[l], x[l+1]);
				swap(y[l], y[l+1]);
			}
			i = l + 1;
			j = ir;
			pivotX = x[l+1];
			pivotY = y[l+1];
			for (;;) {
				do i++; while (x[i] < pivotX);
				do j--; while (x[j] > pivotX);
				if (j < i) break;
				swap(x[i], x[j]);
				swap(y[i], y[j]);
			}
			x[l+1] = x[j];
			x[j] = pivotX;
			y[l+1] = y[j];
			y[j] = pivotY;
			jstack += 2;
			if (jstack > QUICK_SORT_STACK) {
				cerr << "QUICK_SORT_STACK too small in routine quickSort2.\n";
				exit(1);
			}
			if (ir-i+1 >= j-l) {
				istack[jstack] = ir;
				istack[jstack-1] = i;
				ir = j - 1;
			} else {
				istack[jstack] = j - 1;
				istack[jstack-1] = l;
				l = i;
			}
		}
	}
}


/*
 * Replace each element in sorted vector `x' with it's rank.
 */
float crank (float x[], int size) {
	int j = 0, ji, jt;
	float s = 0.0;

	while (j < size-1) {
		if (x[j+1] != x[j]) {
			x[j] = ++j;
		} else {
			for (jt=j+1; jt<size && x[jt]==x[j]; jt++);
			float rank = 0.5 * (j+jt+1);
			for (ji=j; ji<jt; ji++) x[ji] = rank;
			float t = jt - j;
			s += t * t * t - t;
			j = jt;
		}
	}
	if (j == size-1) x[size-1] = size;
	
	return (s);
}


/*
 * Determine the squared distance between two vectors `x' and `y',
 * according to some distance measure. In this case, the Spearman
 * rank correlation is used. The resulting squared distance is
 * returned.
 */
float distanceSquareSpearman (float x[], float y[], int size) {
	// Determine the ranks
	float *ranksX = new float[size];
	float *ranksY = new float[size];
	for (int i=0; i<size; i++) {
		ranksX[i] = x[i];
		ranksY[i] = y[i];
	}
	quickSort2(ranksX, ranksY, size);
	float sf = crank(ranksX, size);
	quickSort2(ranksY, ranksX, size);
	float sg = crank(ranksY, size);
	
	// Calculate the Spearman rank correlation r
	float d = 0.0;
	for (int i=0; i<size; i++) {
		d += square(ranksX[i] - ranksY[i]);
	}

	float en3n = (double) size * size * size - size;
	float fac = (1.0 - sf/en3n) * (1.0 - sg/en3n);
	float r = (1.0 - (6.0/en3n) * (d+(sf+sg)/12.0)) / sqrt(fac);

	// Release memory
	delete(ranksY);
	delete(ranksX);

	// We want distance instead of similarity, so return 1-r
	if (r == 1) {
		return (TINY); // Prevents division by zero elsewhere
	} else {
		return (square(1 - r));
	}
}


/*
 * Determine the squared distance between two vectors `x' and `y',
 * according to some distance measure. In this case, the Pearson
 * distance is used. The resulting squared distance is returned.
 */
inline float distanceSquarePearson (float x[], float y[], int size) {
	// Compute sums
	float s_x = 0, s_y = 0, s_xy = 0, ss_x = 0, ss_y = 0;
	for (int i=0; i<size; i++) {
		s_x += x[i];
		s_y += y[i];
		s_xy += x[i] * y[i];
		ss_x += square(x[i]);
		ss_y += square(y[i]);
	}
	float ss_xx = ss_x - ((float) square(s_x) / size);
	float ss_yy = ss_y - ((float) square(s_y) / size);
	float ss_xy = s_xy - ((float) (s_x * s_y) / size);
	
	// Determine correlation coefficient r
	float r;
	if (ss_xx == 0 && ss_yy == 0) {
		r = 1;
	} else if (ss_xx == 0 || ss_yy == 0) {
		r = 0;
	} else {
		r = ss_xy / (sqrt(ss_xx) * sqrt(ss_yy));
	}
	
	// We want distance instead of similarity, so return 1-r
	if (r == 1) {
		return (TINY); // Prevents division by zero elsewhere
	} else {
		return (square(1 - r));
	}
}


/*
 * Determine the squared distance between two vectors `x' and `y',
 * according to some distance measure. In this case, the Euclidean
 * distance is used. The resulting squared distance is returned.
 */
inline float distanceSquareEuclidean (float x[], float y[], int size) {
	float result = 0;
	
	// Euclidean distance
	for (int i=0; i<size; i++) {
		result += square(x[i] - y[i]);
	}
	
	// Prevents division by zero
	if (result == 0) {
		result = TINY;
	}
	return result;
}

/*
 * Determine the squared distance between two vectors `x' and `y',
 * according to some distance measure. In this case, the Agreement
 * distance is used (WECCA). The resulting squared distance is returned.
 */
inline float distanceSquareAgreement (float x[], float y[], int size) {
	float result = 0;
	
	// Agreement
	for (int i=0; i<size; i++) {
		if (x[i] == y[i]) {
			result++;
		}
	}
	result /= size;
	
	// We want distance instead of similarity, so return 1-Agreement
	if (result == 1) {
		return (TINY); // Prevents division by zero elsewhere
	} else {
		return (square(1 - result));
	}
}

/*
 * Determine the squared distance between two vectors `x' and `y',
 * according to some distance measure. In this case, the Concordance
 * distance is used (WECCA). The resulting squared distance is returned.
 */
inline float distanceSquareConcordance (float x[], float y[], int size) {
	float result = 0;
	
	// Concordance
	for (int i=0; i<size; i++) {
		for (int j=i+1; j<size; j++) {
			if (x[i] < x[j] && y[i] < y[j]) {
				result++;
			} else if (x[i] > x[j] && y[i] > y[j]) {
				result++;
			} else if (x[i] == y[i] && x[j] == y[j]) {
				result++;
			}
		}
	}
	result /= size * (size-1);
	
	// We want distance instead of similarity, so return 1-Concordance
	if (result == 1) {
		return (TINY); // Prevents division by zero elsewhere
	} else {
		return (square(1 - result));
	}
}

/*
 * Determine the squared distance between two vectors `x' and `y',
 * according to some distance measure.
 */
inline float distanceSquare (float x[], float y[], int size) {
	if (distanceMetric == EUCLIDEAN_DIST) {
		return (distanceSquareEuclidean(x, y, size));
	} else if (distanceMetric == PEARSON_DIST) {
		return (distanceSquarePearson(x, y, size));
	} else if (distanceMetric == SPEARMAN_DIST) {
		return (distanceSquareSpearman(x, y, size));
	} else if (distanceMetric == AGREEMENT_DIST) {
		return (distanceSquareAgreement(x, y, size));
	} else if (distanceMetric == CONCORDANCE_DIST) {
		return (distanceSquareConcordance(x, y, size));
	}
	cerr << "ERROR: usage of unknown distance metric\n";
	exit(1);
}

/*
 * Removes duplicate centroids within each individual clustering
 * within the population pointed to by `p'.
 */
void removeDuplicateCentroids (Individual **p, int size) {
	for (int i=0; i<size; i++) {
		Individual *curr = &((*p)[i]);
		for (int j=0; j<curr->nrClusters; j++) {
			for (int k=0; k<j; k++) {
				if (distanceSquare(curr->centroids[j], curr->centroids[k],
				                                    nrAttributes) <= TINY) {
					curr->centroids[j--] = curr->centroids[--curr->nrClusters];
					break;
				}
			}
		}
	}
}


/*
 * Calculate the objective function value of the clustering
 * `gene', with respect to the data set `data'..
 */
float objectiveFunction (DataType data, Individual gene) {
	float result = 0;
	float w[nrCases]; // weigths
	float d[gene.nrClusters][nrCases]; // squared distances
	float u[gene.nrClusters][nrCases]; // membership degrees
	
	// Undesirable clusterings get a huge penalty
	if (gene.nrClusters <= 1 || gene.nrClusters != nrClustersFixed) {
		return HUGE_VAL;
	}

	// Calculate the squared distances
	for (int i=0; i<gene.nrClusters; i++) {
		for (int j=0; j<nrCases; j++) {
			d[i][j] = distanceSquare(gene.centroids[i],
			                         data.values[j], nrAttributes);
		}
	}
	
	// Calculate membership degrees
	for (int i=0; i<gene.nrClusters; i++) {
		for (int j=0; j<nrCases; j++) {
			float sum = 0;
			for (int k=0; k<gene.nrClusters; k++) {
				sum += pow(d[i][j]/d[k][j], (float) 1.0/(M-1));
			}
			u[i][j] = 1/sum;
		}
	}
	
	// Calculate weigths
	float numerator = 0;
	for (int i=0; i<nrCases; i++) {
		float sum = 0;
		for (int j=0; j<gene.nrClusters; j++) {
			sum += pow(u[j][i], M) * d[j][i];
		}
		numerator += sqrt(sum);
	}
	numerator /= nrCases;
	for (int i=0; i<nrCases; i++) {
		float denominator = 0;
		for (int j=0; j<gene.nrClusters; j++) {
			denominator += pow(u[j][i], M) * d[j][i];
		}
		w[i] = numerator / sqrt(denominator);
	}

	// Calculate objective function value
	for (int i=0; i<gene.nrClusters; i++) {
		for (int j=0; j<nrCases; j++) {
			result += w[j] * pow(u[i][j], M) * d[i][j];
		}
	}

	return (result);
}


/*
 * Evaluate each individual within the population `p' according to
 * some objective function, with respect to the data set `data'.
 */
void evaluate (Individual **p, int size, DataType data) {
	for (int i=0; i<size; i++) {
		(*p)[i].fitness = objectiveFunction(data, (*p)[i]);
	}
}


/*
 * Returns the length of vector `v'.
 */
inline float lengthVector (float v[], int size) {
	float sumOfSqrs = 0;

	for (int i=0; i<size; i++) {
		sumOfSqrs += square(v[i]);
	}
	return (sqrt(sumOfSqrs));
}


/*
 * Sort the elements of the vector `centroids' by their relative
 * distance to the origin in ascending order.
 */
void sortCentroids (float **centroids, int size) {
	float length[size];

	// Calculate the distance of each centroid to the origin
	for (int i=0; i<size; i++) {
		length[i] = lengthVector(centroids[i], nrAttributes);
	}

	// Sort cluster centroids with insertion sort
	for (int i=1; i<size; i++) {
		int j = i;
		float tmpLen = length[i];
		float *tmpCen = centroids[i];
		for ( ; j>0 && tmpLen < length[j-1]; j--) {
			length[j] = length[j-1];
			centroids[j] = centroids[j-1];
		}
		length[j] = tmpLen;
		centroids[j] = tmpCen;
	}
}


/*
 * Recombine two parent clusterings, pointed to by `p1' and `p2'.
 * The resulting `child' is constructed from the first part of
 * `p1' (up to centroid `cutOff1') and the last part of `p2'
 * (starting from centroid `cutOff2'). Note that the gene lenghts
 * of `p1', `p2', and `child' can differ.
 */
void makeChild (Individual *child, Individual *p1, Individual *p2,
                                          int cutOff1, int cutOff2) {
	int nrClusters = (cutOff1 + 1) + (p2->nrClusters - cutOff2);
	child->nrClusters = nrClusters;
	child->centroids = new float *[nrClusters];

	for (int i=0; i<=cutOff1; i++) {
		child->centroids[i] = new float[nrAttributes];
		for (int j=0; j<nrAttributes; j++) {
			child->centroids[i][j] = p1->centroids[i][j];
		}
	}
	for (int i=cutOff2; i<p2->nrClusters; i++) {
		child->centroids[++cutOff1] = new float[nrAttributes];
		for (int j=0; j<nrAttributes; j++) {
			child->centroids[cutOff1][j] = p2->centroids[i][j];
		}
	}
}


/*
 * Derive a new population of children of size `newSize' by
 * recombining individuals from the population pointed to by `p'.
 * De recombination is conducted by choosing two random individuals
 * from population `p' and recombining them with sorted 1-point
 * crossover. The resulting child population is returned.
 */
Individual *recombine (Individual **p, int oldSize, int newSize) {
	Individual *result = new Individual[newSize];

	// Sort centroids, so recombination is less disruptive
	for (int i=0; i<oldSize; i++) {
		sortCentroids((*p)[i].centroids, (*p)[i].nrClusters); 
	}
	
	// Apply random 1-point crossover
	for (int i=0; i<newSize; i+=2) {
		// Choose random parents
		Individual *p1 = &((*p)[randomInt(0, oldSize-1)]);
		Individual *p2 = &((*p)[randomInt(0, oldSize-1)]);

		// Choose cutoffs. 
		// p1 will be cut after cutOff1, p2 before cutOff2.
		int cutOff1, cutOff2;
		cutOff1 = randomInt(0, p1->nrClusters-1);
		float lengthCut1 = lengthVector(p1->centroids[cutOff1], nrAttributes);
		for (cutOff2 = 0; cutOff2 < p2->nrClusters; cutOff2++) {
			if (lengthVector(p2->centroids[cutOff2], nrAttributes) >= lengthCut1)
				break;
		}

		// Construct the children
		makeChild(&(result[i]), p1, p2, cutOff1, cutOff2);
		makeChild(&(result[i+1]), p2, p1, cutOff2-1, cutOff1+1);		
	}
	return (result);
}


/*
 * Mutates the individual pointed to by `p' by performing one
 * iteration of the K-Means algorithm with respect to the `data'.
 */
void mutateKmeans (Individual *p, DataType data) {
	int nr[p->nrClusters];
	float sum[p->nrClusters][nrAttributes];

	// Initialise the struct kmeans
	for (int i=0; i<p->nrClusters; i++) {
		nr[i] = 0;
		for (int j=0; j<nrAttributes; j++) {
			sum[i][j] = 0;
		}
	}

	// For each case, determine the centroid that lies nearest to it
	// and increment the kmeans-sum of that concerning centroid.
	for (int i=0; i<nrCases; i++) {
		int nearestCentroid = -1;
		float nearestDist = HUGE_VAL;
		for (int j=0; j<p->nrClusters; j++) {
			float currDist = distanceSquare(p->centroids[j],
			                                data.values[i], nrAttributes);
			if (currDist < nearestDist) {
				nearestDist = currDist;
				nearestCentroid = j;
			}
		}
		for (int j=0; j<nrAttributes; j++) {
			sum[nearestCentroid][j] += data.values[i][j];
		}
		nr[nearestCentroid]++;
	}
	
	// Determine the new centroid, which is the average of the cases
	// of which the concerning centroid is the nearest centroid.
	for (int i=0; i<p->nrClusters; i++) {
		if (nr[i] > 0) {
			for (int j=0; j<nrAttributes; j++) {
				p->centroids[i][j] = sum[i][j] / nr[i];
			}
		}
	}
}

/*
 * Returns the mutation rate after `time' units of time.
 */
float mutationRate (int time) {
	return 1 - 0.90 * ((float)time/MAX_GENERATIONS);
}


/*
 * Mutates the population pointed to by `p', according to some
 * mutation operator.
 */
void mutate (Individual **p, int size, DataType data) {
	for (int i=0; i<size; i++) {
		if (drand48() < mutationRate(generation)) {
			mutateKmeans(&((*p)[i]), data);
		}
	}
}


/*
 * Returns the union of the populations pointed to by `p1' and `p2'.
 */
Individual *unite (Individual **p1, int size1, Individual **p2, int size2) {
	Individual *result = new Individual[size1 + size2];

	for (int i=0; i<size1; i++) {
		result[i] = (*p1)[i];
	}
	for (int i=0; i<size2; i++) {
		result[i + size1] = (*p2)[i];
	}
	return (result);
}



/*
 * Sorts individuals within the population pointed to by `p' by
 * their relative fitness in ascending order. The precondition of
 * this function is that de fitness of the individuals is set
 * (by calling the function `evaluate').
 */
void sortIndividuals (Individual **p, int size) {
	// Sort individuals with shellsort, based on their fitness.
	for (int gap=size/2; gap>0; gap = gap == 2 ? 1 : (int) (gap/2.2)) {
		for (int i=gap; i<size; i++) {
			Individual tmp = (*p)[i];
			int j = i;
			for ( ; j>=gap && tmp.fitness<(*p)[j-gap].fitness; j-=gap) {
				(*p)[j] = (*p)[j-gap];
			}
			(*p)[j] = tmp;
		}
	}
}


/*
 * Returns `true' if the population pointed to by `p' contains
 * a pointer to individual `ind', and `false' if not.
 */
int contains (Individual ind, Individual *p, int size) {
	for (int i=0; i<size; i++) {
		if (&ind == &(p[i])) {
			return (1);
		}
	}
	return (0);
}


/*
 * Returns a selection of individuals from the population pointed
 * to by `p' according to some selection operator. In total, a
 * number of `newSize' individuals are selected.
 */
Individual *select (Individual **p, int oldSize, int newSize) {
	Individual *result = new Individual[newSize];

	sortIndividuals(p, oldSize);
	result[0] = (*p)[0];
	for (int i=1; i<newSize; i++) {
		result[i] = (*p)[i];
	}
	return (result);
}


/*
 * Prints the clustering result to standard output, based on
 * the winning individual `gene':
 *  - For each datum in the dataset, the membership degree to
 *    each cluster is printed
 *  - For each cluster in the resulting clustering, the data
 *    belonging to that cluster is printed.
 *  - Each datum considered an outlier is printed.
 *  - The average profile of each cluster within the resulting
 *    clustering is printed.
 */
void printResults (DataType data, Individual gene) {
	Individual *p = new Individual[1];
	float w[nrCases]; // weigths
	float d[gene.nrClusters][nrCases]; // squared distances
	float u[gene.nrClusters][nrCases]; // membership degrees
	
	// Calculate the squared distances
	for (int i=0; i<gene.nrClusters; i++) {
		for (int j=0; j<nrCases; j++) {
			d[i][j] = distanceSquare(gene.centroids[i],
			                         data.values[j], nrAttributes);
		}
	}
	
	// Calculate membership degrees
	for (int i=0; i<gene.nrClusters; i++) {
		for (int j=0; j<nrCases; j++) {
			float sum = 0;
			for (int k=0; k<gene.nrClusters; k++) {
				sum += pow(d[i][j]/d[k][j], (float) 1.0/(M-1));
			}
			u[i][j] = 1/sum;
		}
	}

	// Calculate weigths
	float numerator = 0;
	for (int i=0; i<nrCases; i++) {
		float sum = 0;
		for (int j=0; j<gene.nrClusters; j++) {
			sum += pow(u[j][i], M) * d[j][i];
		}
		numerator += sqrt(sum);
	}
	numerator /= nrCases;
	for (int i=0; i<nrCases; i++) {
		float denominator = 0;
		for (int j=0; j<gene.nrClusters; j++) {
			denominator += pow(u[j][i], M) * d[j][i];
		}
		w[i] = numerator / sqrt(denominator);
	}

	// Make crisp
	int belong[nrCases];
	for (int i=0; i<nrCases; i++) {
		int nearest = 0;
		for (int j=1; j<gene.nrClusters; j++) {
			if (u[j][i] > u[nearest][i]) {
				nearest = j;
			}
		}
		belong[i] = nearest;
	}

	// Print results
	cout << "\n===> INFORMATION PER SAMPLE\n\n";
	cout << "              Sample\tWeight\tCluster\tMembership degrees\n";
	for (int i=0; i<nrCases; i++) {
		printf("%20s\t%.3f\t%d", data.labels[i], w[i], belong[i] + 1);
		for (int j=0; j<gene.nrClusters; j++) {
			printf("\t%0.3f", u[j][i]);
		}
		cout << '\n';
	}
	cout << "\n===> CLUSTERING\n\n";
	for (int i=0; i<gene.nrClusters; i++) {
		cout << "\tCLUSTER " << i + 1 << ":\n";
		cout << "              Sample\tWeight\tMem.deg.\n";
		for (int j=0; j<nrCases; j++) {
			if (belong[j] == i) {
				printf("%20s\t%.3f\t%.3f\n", data.labels[j], w[j], u[i][j]);
			}
		}
		cout << '\n';
	}
	cout << "===> OUTLIERS (weight <= 0.75)\n\n";
	bool noneFound = true;
	for (int i=0; i<nrCases; i++) {
		if (w[i] <= 0.75) {
			if (noneFound) {
				cout << "              Sample\tWeight\tCluster\tMem.deg.\n";
				noneFound = false;
			}
			printf("%20s\t%.3f\t%d\t%.3f\n", data.labels[i], w[i], belong[i] + 1, u[belong[i]][i]);
		}
	}
	if (noneFound) { // if no outliers found
		cout << "\t<none>\n";
	}
	cout << '\n';
}


/*
 * Main routine of the Dynamic Evolutionary Fuzzy Clustering
 * algorithm.
 */
int main (int argc, char *argv[]) {
	char *fileName;
	ifstream dataFile;
	Individual *offspring;
	Individual best, worst;
	
	// Check arguments
	if (argc != 4) {
		printUsage();
		return (1);
	}
	
	if (strncmp(argv[1], "e", 1) == 0) {
		distanceMetric = EUCLIDEAN_DIST;
	} else if (strncmp(argv[1], "p", 1) == 0) {
		distanceMetric = PEARSON_DIST;
	} else if (strncmp(argv[1], "s", 1) == 0) {
		distanceMetric = SPEARMAN_DIST;
	} else if (strncmp(argv[1], "a", 1) == 0) {
		distanceMetric = AGREEMENT_DIST;
	} else if (strncmp(argv[1], "c", 1) == 0) {
		distanceMetric = CONCORDANCE_DIST;
	} else {
		cerr << "ERROR: illegal value for distance metric.\n";
		printUsage();
		exit(1);
	}
	
	sscanf(argv[2], "%d", &nrClustersFixed);
	fileName = argv[3];
	
	// Read the data
	nrAttributes = readDataFile(fileName, &data);
	
	// Preprocessing
	nrCases = removeFlatProfiles(&data, nrCases);
	
	// Set random seed
	time_t t = time(NULL);
	srand48(t);
	
	// Reproduction cycle
	cout << "\n===> RUNNING\n\n";
	initialize(&population, POPULATION_SZ);
	removeDuplicateCentroids(&population, POPULATION_SZ);
	evaluate(&population, POPULATION_SZ, data);
	generation = 0;
	do {
		cout << "\tgeneration " << generation + 1 << "\n";
		generation++;
		offspring = recombine(&population, POPULATION_SZ, OFFSPRING_SZ);
		mutate(&offspring, OFFSPRING_SZ, data);
		removeDuplicateCentroids(&offspring, OFFSPRING_SZ);
		evaluate(&offspring, OFFSPRING_SZ, data);
		population = unite(&population, POPULATION_SZ, &offspring, OFFSPRING_SZ);
		population = select(&population, POPULATION_SZ+OFFSPRING_SZ, POPULATION_SZ);
		best = population[0];
		worst = population[POPULATION_SZ-1];
	} while (worst.fitness - best.fitness > SMALL && generation < MAX_GENERATIONS);
	
	// Print results
	printResults(data, best);
}
