diff options
author | Andrea Bastoni <bastoni@cs.unc.edu> | 2010-04-12 23:23:50 -0400 |
---|---|---|
committer | Andrea Bastoni <bastoni@cs.unc.edu> | 2010-04-12 23:23:50 -0400 |
commit | 3b21f9d15822453117d1e908ab97cacd8f7f39be (patch) | |
tree | 09b0c519d11975fe5e1d364ff3174fa358407177 /data_analysis/statanalyzer.py | |
parent | 27a6355f1d25bdf8a68bea6e19cc283a943753da (diff) |
Add IQR outliers removal
- this code is ugly C-style code "transformed" into Python;
it should be reworked at some point...
Diffstat (limited to 'data_analysis/statanalyzer.py')
-rw-r--r-- | data_analysis/statanalyzer.py | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/data_analysis/statanalyzer.py b/data_analysis/statanalyzer.py new file mode 100644 index 0000000..6e096c4 --- /dev/null +++ b/data_analysis/statanalyzer.py | |||
@@ -0,0 +1,56 @@ | |||
1 | #!/usr/bin/env python | ||
2 | |||
3 | import numpy as np | ||
4 | from scipy import stats | ||
5 | |||
class InterQuartileRange:
    """Trim a sample to the range delimited by two percentiles.

    ``low`` and ``high`` are percentiles in the range [0, 100] (25 and 75
    give the classic inter-quartile range).  When ``extend`` is true the
    kept range is widened on each side by 1.5 times the distance between
    the two percentile values, which for low=25 / high=75 is the usual
    "1.5 IQR" outlier rule.
    """

    def __init__(self, low, high, extend = False):
        self.low = low
        self.high = high
        # extend is 1.5 extension of IQR
        self.extend = extend

    def remOutliers(self, vector):
        """Return the sorted values of ``vector`` that are not outliers.

        The input is sorted (the caller's data is not modified) and every
        value inside the closed interval [lower, upper] is kept, where the
        bounds are the ``low`` / ``high`` percentile values, optionally
        widened by 1.5 times their distance when ``self.extend`` is true.

        Both bounds are inclusive, so a value equal to a bound is never
        discarded; in particular constant data is returned unchanged and
        the minimum / maximum survive whenever they lie within the bounds.
        An empty input yields an empty array.
        """
        svect = np.sort(vector)
        if svect.size == 0:
            # Percentiles are undefined for an empty sample.
            return svect

        lower = np.percentile(svect, self.low)
        upper = np.percentile(svect, self.high)

        if self.extend:
            # 1.5 IQR outliers elimination: derive the fences from the
            # percentile values themselves, not from neighbouring samples.
            margin = (upper - lower) * 1.5
            lower = lower - margin
            upper = upper + margin

        # svect is sorted, so binary search gives the slice limits directly.
        # side='left' keeps values equal to the lower bound; side='right'
        # keeps values equal to the upper bound and yields len(svect) (not a
        # -1 sentinel) when nothing exceeds it.
        first = np.searchsorted(svect, lower, side='left')
        last = np.searchsorted(svect, upper, side='right')
        return svect[first:last]