Skip to content

Commit 72163ae

Browse files
author
Andrei Scheinkman
committed
cleaned up bob ross clustering script
1 parent 346beab commit 72163ae

File tree

1 file changed

+56
-23
lines changed

1 file changed

+56
-23
lines changed

bob-ross/cluster-paintings.py

+56-23
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,59 @@
1-
from numpy import array
1+
"""
2+
Clusters Bob Ross paintings by features.
3+
4+
By Walter Hickey <[email protected]>
5+
6+
See http://fivethirtyeight.com/features/a-statistical-analysis-of-the-work-of-bob-ross/
7+
"""
8+
9+
import numpy as np
210
from scipy.cluster.vq import vq, kmeans, whiten
311
import math
12+
import csv
13+
14+
def main(csv_path='elements-by-episode.csv', num_clusters=10, rare_tag_cutoff=5):
    """Cluster paintings by their tags and print each episode's cluster.

    Reads the 0/1 tag matrix from ``csv_path``, drops rarely used tags,
    whitens the remaining columns, runs k-means and prints one line per
    episode: its 1-based row number, the distance to its nearest centroid
    and the index of that centroid.

    Args:
        csv_path: CSV file with a header row. The first two columns
            (EPISODE and TITLE) are ignored; every other cell must parse
            as an integer.
        num_clusters: number of centroids requested from k-means.
        rare_tag_cutoff: tags whose column total is at or below this value
            are removed before clustering (with the default, tags used
            5 times or fewer are dropped).
    """
    matrix = _drop_rare_columns(_load_tag_matrix(csv_path), rare_tag_cutoff)

    # normalize according to stddev
    whitened = whiten(matrix)
    codebook = kmeans(whitened, num_clusters)[0]

    # single-argument print() calls behave the same on Python 2 and 3
    print("episode distance cluster")

    # find the nearest centroid for each feature vector
    for episode, vector in enumerate(whitened, 1):
        distance, cluster = _nearest_centroid(vector, codebook)
        print("%s %s %s" % (episode, distance, cluster))


def _load_tag_matrix(csv_path):
    """Return the tag columns of ``csv_path`` as a 2-D integer numpy array."""
    with open(csv_path, 'r') as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # skip header; the builtin works on Python 2.6+ and 3
        # exclude EPISODE and TITLE columns; blank lines are skipped
        rows = [[int(cell) for cell in row[2:]] for row in reader if row]
    return np.array(rows)


def _drop_rare_columns(matrix, cutoff):
    """Return a copy of ``matrix`` without columns whose total is <= ``cutoff``."""
    return matrix[:, matrix.sum(axis=0) > cutoff]


def _nearest_centroid(vector, codebook):
    """Return ``(distance, index)`` of the centroid closest to ``vector``.

    Distance is Euclidean. On a tie the lowest centroid index wins.
    """
    # Walk the centroids that are actually present instead of assuming a
    # fixed count: k-means may hand back fewer centroids than requested
    # (NOTE(review): scipy drops centroids left without members - confirm),
    # and indexing past the end of the codebook would raise IndexError.
    return min(
        (math.sqrt(sum((vector - centroid) ** 2)), index)
        for index, centroid in enumerate(codebook)
    )
457

5-
# TK: Load data from file as array and assign to bobross
6-
7-
# Normalizes according to st.dev.
8-
whitened = whiten(bobross)
9-
output = kmeans(whitened,10)
10-
print output
11-
12-
# Determines distance between each of 403 vectors and each centroid, and finds closest neighbor
13-
for i in range(0,403):
14-
print i+1
15-
# Dist between centroid 0 and vector
16-
distance = math.sqrt(sum((whitened[i] - output[0][0]) ** 2))
17-
# Group is the centroid it is closest to so far, set initially to centroid 0
18-
group = 0
19-
# Combo combines distance and group into a single entity
20-
combo = (distance, group)
21-
# Tests the vector i against the 10 centroids, finds nearest neighbor:
22-
for x in range (0,10):
23-
distance_temp = math.sqrt(sum((whitened[i] - output[0][x]) ** 2))
24-
if distance_temp < combo[0]:
25-
combo = (distance_temp,x)
26-
print combo
58+
if __name__ == "__main__":
59+
main()

0 commit comments

Comments
 (0)