src/lib/JenksBreaks.h
/**************************************************************************************
* File name: JenksBreaks.h
*
* Project: MapWindow Open Source (MapWinGis ActiveX control)
* Description: Declaration of JenksBreaks.h
*
**************************************************************************************
* The contents of this file are subject to the Mozilla Public License Version 1.1
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at http://www.mozilla.org/mpl/
* See the License for the specific language governing rights and limitations
* under the License.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
**************************************************************************************
* Contributor(s):
* (Open source contributors should list themselves and their modifications here). */
// Sergei Leschinsky (lsu) 25 june 2010 - created the file
#pragma once
#include <vector>
#include <algorithm>
#include "float.h"
#ifndef MAX
# define MIN(a,b) ((a<b) ? a : b)
# define MAX(a,b) ((a>b) ? a : b)
#endif
class CJenksBreaks
{
// data structures
struct JenksData
{
public:
double value;
double square;
int classId; // number of break to which a value belongs
};
struct JenksBreak
{
public:
double value;
double square;
int count;
double SDev;
int startId;
int endId;
JenksBreak()
{
value = 0.0;
square = 0.0;
count = 0;
SDev = 0.0;
startId = -1;
endId = -1;
}
// the formula for every class is: sum((a[k])^2) - sum(a[k])^2/n
void RefreshStandardDeviations()
{
SDev = square - pow(value, 2.0)/(double)(endId - startId + 1);
}
};
public:
// Constructor
CJenksBreaks(std::vector<double>* values, int numClasses)
{
if (numClasses > 0)
{
_numClasses = numClasses;
_numValues = values->size();
double classCount = (double)_numValues/(double)numClasses;
sort(values->begin(), values->end()); //values sould be sorted
// fill values
for (int i = 0; i < _numValues; i++)
{
JenksData data;
data.value = (*values)[i];
data.square = pow((*values)[i], 2.0);
data.classId = int(floor(i/classCount));
_values.push_back(data);
}
_classes.resize(_numClasses);
// calculate initial deviations for classes
int lastId = -1;
for (int i = 0; i < _numValues; i++)
{
int classId = _values[i].classId;
if (classId >= 0 && classId < _numClasses)
{
_classes[classId].value += _values[i].value;
_classes[classId].square += _values[i].square;
_classes[classId].count += 1;
// saving bound between classes
if (classId != lastId)
{
_classes[classId].startId = i;
lastId = classId;
if (classId > 0)
_classes[classId - 1].endId = i - 1;
}
}
else
{
// TODO: add error handling
}
}
_classes[_numClasses - 1].endId = _numValues - 1;
for (int i = 0; i < _numClasses; i++)
_classes[i].RefreshStandardDeviations();
}
}
~CJenksBreaks(){}
std::vector<int>* TestIt(std::vector<double>* data, int numClasses)
{
int numValues = data->size();
//std::vector<std::vector<int>> mat1;
//std::vector<std::vector<float>> mat2;
//mat1.resize(numValues + 1);
//mat2.resize(numValues + 1);
//for (int i = 0; i <= numValues; i++)
//{
// mat1[i].resize(numClasses + 1);
// mat2[i].resize(numClasses + 1, FLT_MAX);
//}
//for (int j = 1; j < numClasses + 1; j++)
//{
// mat1[1][j] = 1;
// mat2[1][j] = 0.0;
//}
//
//double s1,s2,w,SSD;
//SSD = 0.0;
//for(int l = 2; l <= numValues; l++) //arrays are 0 based, but we skip 1
// {
// s1=s2=w=0;
// for(int m = 1; m <= l; m++)
// {
// int i3 = l - m + 1;
// double val = (*data)[i3 - 1];
// s2 += val * val;
// s1 += val;
// w++;
// SSD = s2 - (s1 * s1) / w;
// int i4 = i3 - 1;
// if(i4 != 0 )
// {
// for(int j = 2; j <= numClasses; j++)
// {
// double newVal = mat2[i4][j - 1] + SSD;
// double oldValue = mat2[l][j];
// if(newVal <= oldValue) // if new class is better than previous than let's write it
// {
// mat1[l][j] = i3;
// mat2[l][j] = mat2[i4][j - 1] + SSD;
// }
// }
// }
// }
// mat1[l][0] = 0;
// mat2[l][0] =SSD;
//}
//
std::vector<int>* result = new std::vector<int>;
result->resize(numClasses + 1);
(*result)[numClasses] = numValues;
//
//int k = numValues;
// for(int j = result->size() - 1; j >= 1; j--)
// {
// int id = mat1[k][j] - 1;
// (*result)[j - 1] = id;
// k = id;
// }
// ESRI breaks
/*(*result)[0] = 0;
(*result)[1] = 387;
(*result)[2] = 1261;
(*result)[3] = 2132;
(*result)[4] = 2698;
(*result)[5] = 2890;
(*result)[6] = 2996;
(*result)[7] = 3064;
(*result)[8] = 3093;
(*result)[9] = 3107;*/
/*for (int i = 1; i <= numClasses; i++)
{
(*result)[i] = numValues/numClasses * i;
}*/
double step = ((*data)[numValues - 1] - (*data)[0])/(numClasses);
int cnt = 0;
for (int i = 0; i < numValues; i++)
{
if ((*data)[i] > step * cnt)
{
cnt++;
if (cnt > numClasses) break;
(*result)[cnt] = i;
}
}
double s1,s2,w,SSD;
SSD = 0;
for (int i = 1; i< numClasses + 1; i++)
{
int low, high;
if ( i == 1) low = 0;
else low = (*result)[i-1];
if ( i == numClasses) high = numValues;
else high = (*result)[i] -1;
s2 = s1 = w = 0;
for (int j = low; j < high; j++)
{
double val = (*data)[j];
s2 += val * val;
s1 += val;
w++;
}
if (w != 0.0)
SSD += s2 - (s1 * s1) / w;
}
//// cleaning
//for (int i = 0; i < numValues; i++)
//{
// delete[] mat1[i];
// delete[] mat2[i];
//}
//delete[] mat1;
//delete[] mat2;
return result;
}
// -------------------------------------------------------------------
// Optimization routine
// -------------------------------------------------------------------
void Optimize()
{
// initialization
double minValue = get_SumStandardDeviations(); // current best minimum
_leftBound = 0; // we'll consider all classes in the beginning
_rightBound = _classes.size() - 1;
_previousMaxId = -1;
_previousTargetId = - 1;
int numAttemmpts = 0;
bool proceed = true;
while (proceed)
{
for (int i = 0; i < _numValues; i++)
{
FindShift();
// when there are only 2 classes left we should stop on the local max value
if (_rightBound - _leftBound == 0)
{
double val = get_SumStandardDeviations(); // the final minimum
numAttemmpts++;
if ( numAttemmpts > 5)
{
return;
}
}
}
double value = get_SumStandardDeviations();
proceed = (value < minValue)?true:false; // if the deviations became smaller we'll execute one more loop
if (value < minValue)
minValue = value;
}
}
// -------------------------------------------------------------------
// Returning of results (indices of values to start each class)
// -------------------------------------------------------------------
std::vector<long>* get_Results()
{
std::vector<long>* results = new std::vector<long>;
results->resize(_numClasses);
for (int i = 0; i < _numClasses; i++ )
{
(*results)[i] = _classes[i].startId;
}
return results;
}
private:
// data members
std::vector<JenksData> _values;
std::vector<JenksBreak> _classes;
int _numClasses;
int _numValues;
int _previousMaxId; // to prevent stalling in the local optimum
int _previousTargetId;
int _leftBound; // the id of classes between which optimization takes place
int _rightBound; // initially it's all classes, then it's reducing
// ******************************************************************
// Calculates the sum of standard deviations of individual variants
// from the class mean through all class
// It's the objective function - should be minimized
//
// ******************************************************************
double get_SumStandardDeviations()
{
double sum = 0.0;
for (int i = 0; i < _numClasses; i++)
{
sum += _classes[i].SDev;
}
return sum;
}
// ******************************************************************
// MakeShift()
// ******************************************************************
// Passing the value from one class to another to another. Criteria - standard deviation.
void FindShift()
{
// first we'll find classes with the smallest and largest SD
int maxId = 0, minId = 0;
double minValue = 100000000000.0, maxValue = 0.0; // use constant
for (int i = _leftBound; i <= _rightBound; i++)
{
if (_classes[i].SDev > maxValue)
{
maxValue = _classes[i].SDev;
maxId = i;
}
if (_classes[i].SDev < minValue)
{
minValue = _classes[i].SDev;
minId = i;
}
}
// then pass one observation from the max class in the direction of min class
int valueId = -1;
int targetId = -1;
if (maxId > minId)
{
//<- we should find first value of max class
valueId = _classes[maxId].startId;
targetId = maxId - 1;
_classes[maxId].startId++;
_classes[targetId].endId++;
}
else if (maxId < minId)
{
//-> we should find last value of max class
valueId = _classes[maxId].endId;
targetId = maxId + 1;
_classes[maxId].endId--;
_classes[targetId].startId--;
}
else
{
// only one class left or the deviations withinb classes are equal
return;
}
// Prevents stumbling in local optimum - algorithm will be repeating the same move
// To prevent this we'll exclude part of classes with less standard deviation
if (_previousMaxId == targetId && _previousTargetId == maxId)
{
// Now we choose which of the two states provides less deviation
double value1 = get_SumStandardDeviations();
// change to second state
MakeShift(maxId, targetId, valueId);
double value2 = get_SumStandardDeviations();
// if first state is better revert to it
if (value1 < value2)
{
MakeShift(targetId, maxId, valueId);
}
// now we can exclude part of the classes where no improvements can be expected
int min = MIN(targetId, maxId);
int max = MAX(targetId, maxId);
double avg = get_SumStandardDeviations()/(_rightBound - _leftBound + 1);
// analyze left side of distribution
double sumLeft = 0, sumRight = 0;
for (int j = _leftBound; j <= min; j++)
{
sumLeft += pow(_classes[j].SDev - avg, 2.0);
}
sumLeft /= (min - _leftBound + 1);
// analyze right side of distribution
for (int j = _rightBound; j >= max; j--)
{
sumRight += pow(_classes[j].SDev - avg, 2.0);
}
sumRight /= (_rightBound - max + 1);
// exluding left part
if (sumLeft >= sumRight)
{
_leftBound = max;
}
// exluding right part
else if (sumLeft < sumRight)
{
_rightBound = min;
}
}
else
{
MakeShift(maxId, targetId, valueId);
}
}
// perform actual shift
void MakeShift(int maxId, int targetId, int valueId)
{
// saving the last shift
_previousMaxId = maxId;
_previousTargetId = targetId;
JenksData* data = &(_values[valueId]);
// removing from max class
_classes[maxId].value -= data->value;
_classes[maxId].square -= data->square;
_classes[maxId].count -= 1;
_classes[maxId].RefreshStandardDeviations();
// passing to target class
_classes[targetId].value += data->value;
_classes[targetId].square += data->square;
_classes[targetId].count += 1;
_classes[targetId].RefreshStandardDeviations();
// mark that the value was passed
_values[valueId].classId = targetId;
}
};