Introduction
Quality Threshold (or QT) is an algorithm used in cluster analysis which is
described in this Wikipedia article (http://en.wikipedia.org/wiki/Cluster_analysis).
The basic idea of cluster analysis is to partition a set of points into clusters
which have some relationship to each other. In the case of QT the user chooses
the maximum diameter (i.e. distance between points) that a cluster can have and
each point is then considered in turn, along with its neighbors, and allocated
to a cluster.
I was recently asked if I could implement this algorithm in C# as there appears
to be no other implementation which is freely available.
The source code is listed below in case it is of interest to anyone else
involved in this specialized field.
Source code including sample data
using
System;
using
System.Collections.Generic;
struct
Point
{
public int
X, Y;
public Point(int
x, int y)
{
this.X = x;
this.Y = y;
}
public override
string ToString()
{
return
String.Format("({0}, {1})", X, Y);
}
public static
int DistanceSquared(Point
p1, Point p2)
{
int diffX = p2.X - p1.X;
int diffY = p2.Y - p1.Y;
return diffX * diffX + diffY * diffY;
}
}
static
class QT
{
static void
Main()
{
List<Point>
points = new List<Point>();
// sample data
points.Add(new
Point(0,100));
points.Add(new
Point(0,200));
points.Add(new
Point(0,275));
points.Add(new
Point(100, 150));
points.Add(new
Point(200,100));
points.Add(new
Point(250,200));
double maxDiameter = 150.0;
List<List<Point>>
clusters = GetClusters(points, maxDiameter);
Console.Clear();
// print points to console
Console.WriteLine("The
{0} points are :\n", points.Count);
foreach(Point
p in points) Console.Write("
{0} ", p);
Console.WriteLine();
// print clusters to console
for(int
i = 0; i < clusters.Count; i++)
{
int count = clusters[i].Count;
string plural = (count != 1) ?
"s" : "";
Console.WriteLine("\nCluster
{0} consists of the following {1} point{2} :\n", i + 1, count, plural);
foreach(Point
p in clusters[i])
Console.Write(" {0} ", p);
Console.WriteLine();
}
Console.ReadKey();
}
static List<List<Point>>
GetClusters(List<Point>
points, double maxDiameter)
{
if (points ==
null) return
null;
points = new
List<Point>(points);
// leave original List unaltered
List<List<Point>>
clusters = new List<List<Point>>();
while (points.Count > 0)
{
List<Point>
bestCandidate = GetBestCandidate(points, maxDiameter);
clusters.Add(bestCandidate);
}
return clusters;
}
/*
GetBestCandidate() returns first candidate cluster encountered if there
is more than one
with the maximum number of points.
*/
static List<Point>
GetBestCandidate(List<Point> points,
double maxDiameter)
{
double maxDiameterSquared = maxDiameter
* maxDiameter; // square maximum diameter
List<List<Point>>
candidates = new
List<List<Point>>();
// stores all candidate clusters
List<Point>
currentCandidate = null;
// stores current candidate cluster
int[] candidateNumbers =
new int[points.Count];
// keeps track of candidate numbers to which points
have been allocated
int totalPointsAllocated = 0;
// total number of points already allocated to
candidates
int currentCandidateNumber = 0;
// current candidate number
for(int i = 0; i
< points.Count; i++)
{
if (totalPointsAllocated ==
points.Count) break;
// no need to continue further
if (candidateNumbers[i] > 0)
continue; // point
already allocated to a candidate
currentCandidateNumber++;
currentCandidate = new
List<Point>();
// create a new candidate cluster
currentCandidate.Add(points[i]); //
add the current point to it
candidateNumbers[i] = currentCandidateNumber;
totalPointsAllocated++;
Point latestPoint = points[i];
// latest point added to current cluster
int[] diametersSquared =
new int[points.Count];
// diameters squared of each
point when added to current cluster
// iterate through any remaining points
// successively selecting the point
closest to the group until the threshold is exceeded
while (true)
{
if (totalPointsAllocated ==
points.Count) break;
// no need to continue further
int closest = -1;
// index of closest point to current candidate cluster
int minDiameterSquared =
Int32.MaxValue;
// minimum diameter squared,
initialized to impossible value
for (int j = i +
1; j < points.Count; j++)
{
if(candidateNumbers[j] > 0)
continue;
// point already allocated to a candidate
// update diameters squared to
allow for latest point added to current cluster
int distSquared =
Point.DistanceSquared(latestPoint, points[j]);
if (distSquared >
diametersSquared[j]) diametersSquared[j] = distSquared;
// check if closer than previous
closest point
if (diametersSquared[j]
< minDiameterSquared)
{
minDiameterSquared = diametersSquared[j];
closest = j;
}
}
// if closest point is within
maxDiameter, add it to the current candidate and mark it accordingly
if ((double)minDiameterSquared
<= maxDiameterSquared)
{
currentCandidate.Add(points[closest]);
candidateNumbers[closest] = currentCandidateNumber;
totalPointsAllocated++;
}
else
// otherwise finished with current candidate
{
break;
}
}
// add current candidate to candidates list
candidates.Add(currentCandidate);
}
// now find the candidate cluster with the largest
number of points
int maxPoints = -1;
// impossibly small value
int bestCandidateNumber = 0;
// ditto
for(int
i = 0; i < candidates.Count; i++)
{
if (candidates[i].Count > maxPoints)
{
maxPoints = candidates[i].Count;
bestCandidateNumber = i + 1; //
counting from 1 rather than 0
}
}
// iterating backwards to avoid indexing problems,
remove points in best candidate from points list
// this will automatically be persisted
to caller as List<Point> is a reference type
for(int
i = candidateNumbers.Length - 1; i >= 0; i--)
{
if (candidateNumbers[i] ==
bestCandidateNumber) points.RemoveAt(i);
}
// return best candidate to caller
return
candidates[bestCandidateNumber - 1];
}
}
Test results
The results of running this code should be:
The 6 points are :
(0, 100) (0, 200) (0, 275) (100, 150) (200, 100) (250, 200)
Cluster 1 consists of the following 3 points :
(0, 100) (0, 200) (100, 150)
Cluster 2 consists of the following 2 points :
(200, 100) (250, 200)
Cluster 3 consists of the following 1 point :
(0, 275)