Introduction
Density-Based Spatial Clustering of Applications with Noise (or DBSCAN) is an algorithm used in cluster analysis which is described in this Wikipedia article (
http://en.wikipedia.org/wiki/DBSCAN).
The basic idea of cluster analysis is to partition a set of points into clusters which have some relationship to each other. In the case of DBSCAN the user chooses the minimum number of points required to form a cluster and the maximum distance between points in each cluster. Each point is then considered in turn, along with its neighbours, and allocated to a cluster.
I was recently asked if I could implement this algorithm in C# as there appears to be no other implementation which is freely available.
The source code is listed below in case it is of interest to anyone else involved in this specialized field.
Source Code
using System;
using
System.Collections.Generic;
using System.Linq;
class Point
{
public
const int NOISE
= -1;
public
const int
UNCLASSIFIED = 0;
public
int X, Y, ClusterId;
public
Point(int x, int
y)
{
this.X = x;
this.Y = y;
}
public
override string
ToString()
{
return String.Format("({0}, {1})", X, Y);
}
public
static int
DistanceSquared(Point p1, Point p2)
{
int diffX = p2.X - p1.X;
int diffY = p2.Y - p1.Y;
return diffX * diffX + diffY * diffY;
}
}
static class DBSCAN
{
static
void Main()
{
List<Point> points = new List<Point>();
// sample data
points.Add(new Point(0,
100));
points.Add(new Point(0,
200));
points.Add(new Point(0,
275));
points.Add(new Point(100,
150));
points.Add(new Point(200,
100));
points.Add(new Point(250,
200));
points.Add(new Point(0,
300));
points.Add(new Point(100,
200));
points.Add(new Point(600,
700));
points.Add(new Point(650,
700));
points.Add(new Point(675,
700));
points.Add(new Point(675, 710));
points.Add(new Point(675,
720));
points.Add(new Point(50,
400));
double eps = 100.0;
int minPts = 3;
List<List<Point>> clusters = GetClusters(points, eps,
minPts);
Console.Clear();
// print points to console
Console.WriteLine("The {0}
points are :\n", points.Count);
foreach (Point p in points) Console.Write(" {0} ", p);
Console.WriteLine();
// print clusters to console
int total = 0;
for (int i = 0; i <
clusters.Count; i++)
{
int count = clusters[i].Count;
total += count;
string plural = (count != 1) ? "s" : "";
Console.WriteLine("\nCluster
{0} consists of the following {1} point{2} :\n", i + 1, count,
plural);
foreach (Point p in clusters[i]) Console.Write(" {0} ", p);
Console.WriteLine();
}
// print any points which are NOISE
total = points.Count -
total;
if (total > 0)
{
string
plural = (total != 1) ? "s" : "";
string verb = (total != 1) ? "are"
: "is";
Console.WriteLine("\nThe
following {0} point{1} {2} NOISE :\n", total, plural, verb);
foreach (Point p in points)
{
if (p.ClusterId == Point.NOISE)
Console.Write("
{0} ", p);
}
Console.WriteLine();
}
else
{
Console.WriteLine("\nNo
points are NOISE");
}
Console.ReadKey();
}
static
List<List<Point>> GetClusters(List<Point> points, double
eps, int minPts)
{
if (points == null) return null;
List<List<Point>> clusters = new
List<List<Point>>();
eps *= eps; // square eps
int clusterId = 1;
for (int i = 0; i <
points.Count; i++)
{
Point p = points[i];
if (p.ClusterId == Point.UNCLASSIFIED)
{
if (ExpandCluster(points, p, clusterId, eps, minPts))
clusterId++;
}
}
// sort out points into their clusters, if any
int maxClusterId = points.OrderBy(p =>
p.ClusterId).Last().ClusterId;
if (maxClusterId < 1) return
clusters; // no clusters, so list is empty
for (int i = 0; i <
maxClusterId; i++) clusters.Add(new List<Point>());
foreach (Point p in points)
{
if (p.ClusterId > 0) clusters[p.ClusterId -
1].Add(p);
}
return clusters;
}
static
List<Point>
GetRegion(List<Point>
points, Point p, double
eps)
{
List<Point> region = new List<Point>();
for (int i = 0; i <
points.Count; i++)
{
int distSquared = Point.DistanceSquared(p,
points[i]);
if (distSquared <= eps) region.Add(points[i]);
}
return region;
}
static
bool ExpandCluster(List<Point> points, Point
p, int clusterId, double
eps, int minPts)
{
List<Point> seeds =
GetRegion(points, p, eps);
if (seeds.Count < minPts) // no core
point
{
p.ClusterId = Point.NOISE;
return false;
}
else // all points in seeds are density
reachable from point 'p'
{
for (int i = 0; i
< seeds.Count; i++) seeds[i].ClusterId = clusterId;
seeds.Remove(p);
while (seeds.Count > 0)
{
Point currentP = seeds[0];
List<Point>
result = GetRegion(points, currentP, eps);
if (result.Count >= minPts)
{
for (int i = 0; i
< result.Count; i++)
{
Point resultP = result[i];
if (resultP.ClusterId == Point.UNCLASSIFIED
|| resultP.ClusterId == Point.NOISE)
{
if (resultP.ClusterId == Point.UNCLASSIFIED)
seeds.Add(resultP);
resultP.ClusterId = clusterId;
}
}
}
seeds.Remove(currentP);
}
return true;
}
}
}
Test Results
The results of running this code should be:
The 14 points are :
(0, 100) (0, 200) (0, 275) (100, 150) (200, 100) (250, 200) (0, 300) (100, 200)
(600, 700) (650, 700) (675, 700) (675, 710) (675, 720) (50, 400)
Cluster 1 consists of the following 6 points :
(0, 100) (0, 200) (0, 275) (100, 150) (0, 300) (100, 200)
Cluster 2 consists of the following 5 points :
(600, 700) (650, 700) (675, 700) (675, 710) (675, 720)
The following 3 points are NOISE :
(200, 100) (250, 200) (50, 400)