Skip to content

Commit d518951

Browse files
authored
Add hot spot analysis (#27)
* added hotspot analysis
1 parent 4fc960a commit d518951

File tree

4 files changed

+655
-54
lines changed

4 files changed

+655
-54
lines changed

xrspatial/focal.py

+358-3
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,204 @@
11
import numpy as np
2-
32
from xarray import DataArray
3+
from xrspatial.utils import ngjit, lnglat_to_meters
4+
from numba import prange
5+
import re
6+
import warnings
7+
8+
warnings.simplefilter('default')
9+
10+
DEFAULT_UNIT = 'meter'
11+
12+
13+
# TODO: Make convolution more generic with numba first-class functions.
14+
15+
16+
def is_number(s):
17+
try:
18+
float(s)
19+
return True
20+
except ValueError:
21+
return False
22+
23+
24+
# modified from https://stackoverflow.com/questions/3943752/the-dateutil-parser-parse-of-distance-strings
25+
class Distance(object):
26+
METER = 1
27+
FOOT = 0.3048
28+
KILOMETER = 1000
29+
MILE = 1609.344
30+
UNITS = {'meter': METER,
31+
'meters': METER,
32+
'm': METER,
33+
'feet': FOOT,
34+
'foot': FOOT,
35+
'ft': FOOT,
36+
'miles': MILE,
37+
'mls': MILE,
38+
'ml': MILE,
39+
'kilometer': KILOMETER,
40+
'kilometers': KILOMETER,
41+
'km': KILOMETER,
42+
}
43+
44+
def __init__(self, s):
45+
self.number, unit = self._get_distance_unit(s)
46+
self._convert(unit)
47+
48+
def _get_distance_unit(self, s):
49+
# spit string into numbers and text
50+
splits = [x for x in re.split(r'(-?\d*\.?\d+)', s) if x != '']
51+
if len(splits) not in [1, 2]:
52+
raise ValueError("Invalid distance.")
53+
54+
number = splits[0]
55+
unit = DEFAULT_UNIT
56+
if len(splits) == 1:
57+
warnings.warn('Raster distance unit not provided. '
58+
'Use meter as default.', Warning)
59+
elif len(splits) == 2:
60+
unit = splits[1]
61+
62+
unit = unit.lower()
63+
unit = unit.replace(' ', '')
64+
if unit not in self.UNITS:
65+
raise ValueError(
66+
"Invalid value.\n"
67+
"Distance unit should be one of the following: \n"
68+
"meter (meter, meters, m),\n"
69+
"kilometer (kilometer, kilometers, km),\n"
70+
"foot (foot, feet, ft),\n"
71+
"mile (mile, miles, ml, mls)")
72+
return number, unit
73+
74+
def _convert(self, unit):
75+
self.number = float(self.number)
76+
if self.UNITS[unit] != 1:
77+
self.number *= self.UNITS[unit]
78+
79+
@property
80+
def meters(self):
81+
return self.number
82+
83+
@meters.setter
84+
def meters(self, v):
85+
self.number = float(v)
86+
87+
@property
88+
def miles(self):
89+
return self.number / self.MILE
90+
91+
@miles.setter
92+
def miles(self, v):
93+
self.number = v
94+
self._convert('miles')
95+
96+
@property
97+
def feet(self):
98+
return self.number / self.FOOT
99+
100+
@feet.setter
101+
def feet(self, v):
102+
self.number = v
103+
self._convert('feet')
104+
105+
@property
106+
def kilometers(self):
107+
return self.number / self.KILOMETER
108+
109+
@kilometers.setter
110+
def kilometers(self, v):
111+
self.number = v
112+
self._convert('KILOMETER')
4113

5-
from xrspatial.utils import ngjit
6114

115+
def _calc_cell_size(raster):
116+
if 'unit' in raster.attrs:
117+
unit = raster.attrs['unit']
118+
else:
119+
unit = DEFAULT_UNIT
120+
warnings.warn('Raster distance unit not provided. '
121+
'Use meter as default.', Warning)
7122

8-
#TODO: Make convolution more generic with numba first-class functions.
123+
cell_size_x = 1
124+
cell_size_y = 1
125+
126+
# calculate cell size from input `raster`
127+
for dim in raster.dims:
128+
if (dim.lower().count('x')) > 0:
129+
# dimension of x-coordinates
130+
if len(raster[dim]) > 1:
131+
cell_size_x = raster[dim].values[1] - raster[dim].values[0]
132+
elif (dim.lower().count('y')) > 0:
133+
# dimension of y-coordinates
134+
if len(raster[dim]) > 1:
135+
cell_size_y = raster[dim].values[1] - raster[dim].values[0]
136+
137+
lon0, lon1, lat0, lat1 = None, None, None, None
138+
for dim in raster.dims:
139+
if (dim.lower().count('lon')) > 0:
140+
# dimension of x-coordinates
141+
if len(raster[dim]) > 1:
142+
lon0, lon1 = raster[dim].values[0], raster[dim].values[1]
143+
elif (dim.lower().count('lat')) > 0:
144+
# dimension of y-coordinates
145+
if len(raster[dim]) > 1:
146+
lat0, lat1 = raster[dim].values[0], raster[dim].values[1]
147+
148+
# convert lat-lon to meters
149+
if (lon0, lon1, lat0, lat1) != (None, None, None, None):
150+
mx0, my0 = lnglat_to_meters(lon0, lat0)
151+
mx1, my1 = lnglat_to_meters(lon1, lat1)
152+
cell_size_x = mx1 - mx0
153+
cell_size_y = my1 - my0
154+
unit = DEFAULT_UNIT
155+
156+
sx = Distance(str(cell_size_x) + unit)
157+
sy = Distance(str(cell_size_y) + unit)
158+
return sx, sy
159+
160+
161+
def _gen_ellipse_kernel(half_w, half_h):
162+
# x values of interest
163+
x = np.linspace(-half_w, half_w, 2 * half_w + 1)
164+
# y values of interest, as a "column" array
165+
y = np.linspace(-half_h, half_h, 2 * half_h + 1)[:, None]
166+
167+
# True for points inside the ellipse
168+
# (x / a)^2 + (y / b)^2 <= 1, avoid division to avoid rounding issue
169+
ellipse = (x * half_h) ** 2 + (y * half_w) ** 2 <= (half_w * half_h) ** 2
170+
171+
return ellipse.astype(float)
172+
173+
174+
class Kernel:
175+
def __init__(self, shape='circle', radius=10000):
176+
self.shape = shape
177+
self.radius = radius
178+
self._validate_shape()
179+
self._validate_radius()
180+
181+
def _validate_shape(self):
182+
# validate shape
183+
if self.shape not in ['circle']:
184+
raise ValueError(
185+
"Kernel shape must be \'circle\'")
186+
187+
def _validate_radius(self):
188+
# try to convert into Distance object
189+
d = Distance(str(self.radius))
190+
191+
def to_array(self, raster):
192+
# calculate cell size over the x and y axis
193+
sx, sy = _calc_cell_size(raster)
194+
# create Distance object of radius
195+
sr = Distance(str(self.radius))
196+
if self.shape == 'circle':
197+
# convert radius (meter) to pixel
198+
kernel_half_w = int(sr.meters / sx.meters)
199+
kernel_half_h = int(sr.meters / sy.meters)
200+
kernel = _gen_ellipse_kernel(kernel_half_w, kernel_half_h)
201+
return kernel
9202

10203

11204
@ngjit
@@ -30,6 +223,7 @@ def _mean(data, excludes):
30223
out[y, x] = data[y, x]
31224
return out
32225

226+
33227
# TODO: add optional name parameter `name='mean'`
34228
def mean(agg, passes=1, excludes=[np.nan]):
35229
"""
@@ -53,3 +247,164 @@ def mean(agg, passes=1, excludes=[np.nan]):
53247

54248
return DataArray(out, name='mean',
55249
dims=agg.dims, coords=agg.coords, attrs=agg.attrs)
250+
251+
252+
@ngjit
253+
def calc_mean(array):
254+
return np.nanmean(array)
255+
256+
257+
@ngjit
258+
def calc_sum(array):
259+
return np.nansum(array)
260+
261+
262+
@ngjit
263+
def upper_bound_p_value(zscore):
264+
if abs(zscore) >= 2.33:
265+
return 0.0099
266+
if abs(zscore) >= 1.65:
267+
return 0.0495
268+
if abs(zscore) >= 1.29:
269+
return 0.0985
270+
return 1
271+
272+
273+
@ngjit
274+
def _hot_cold(zscore):
275+
if zscore > 0:
276+
return 1
277+
if zscore < 0:
278+
return -1
279+
return 0
280+
281+
282+
@ngjit
283+
def _confidence(zscore):
284+
p_value = upper_bound_p_value(zscore)
285+
if abs(zscore) > 2.58 and p_value < 0.01:
286+
return 99
287+
if abs(zscore) > 1.96 and p_value < 0.05:
288+
return 95
289+
if abs(zscore) > 1.65 and p_value < 0.1:
290+
return 90
291+
return 0
292+
293+
294+
@ngjit
295+
def _apply(data, kernel_array, func):
296+
out = np.zeros_like(data)
297+
rows, cols = data.shape
298+
krows, kcols = kernel_array.shape
299+
hrows, hcols = int(krows / 2), int(kcols / 2)
300+
kernel_values = np.zeros_like(kernel_array, dtype=data.dtype)
301+
302+
for y in prange(rows):
303+
for x in prange(cols):
304+
# kernel values are all nans at the beginning of each step
305+
kernel_values.fill(np.nan)
306+
for ky in range(y - hrows, y + hrows + 1):
307+
for kx in range(x - hcols, x + hcols + 1):
308+
if ky >= 0 and kx >= 0:
309+
if ky >= 0 and ky < rows and kx >= 0 and kx < cols:
310+
if kernel_array[ky - (y - hrows), kx - (x - hcols)] == 1:
311+
kernel_values[ky - (y - hrows), kx - (x - hcols)] = data[ky, kx]
312+
out[y, x] = func(kernel_values)
313+
return out
314+
315+
316+
def apply(raster, kernel, func=calc_mean):
317+
# validate raster
318+
if not isinstance(raster, DataArray):
319+
raise TypeError("`raster` must be instance of DataArray")
320+
321+
if raster.ndim != 2:
322+
raise ValueError("`raster` must be 2D")
323+
324+
if not (issubclass(raster.values.dtype.type, np.integer) or
325+
issubclass(raster.values.dtype.type, np.float)):
326+
raise ValueError(
327+
"`raster` must be an array of integers or float")
328+
329+
# create kernel mask array
330+
kernel_values = kernel.to_array(raster)
331+
# apply kernel to raster values
332+
out = _apply(raster.values.astype(float), kernel_values, func)
333+
334+
result = DataArray(out,
335+
coords=raster.coords,
336+
dims=raster.dims,
337+
attrs=raster.attrs)
338+
339+
return result
340+
341+
342+
@ngjit
343+
def _hotspots(z_array):
344+
out = np.zeros_like(z_array, dtype=np.int8)
345+
rows, cols = z_array.shape
346+
for y in prange(rows):
347+
for x in prange(cols):
348+
out[y, x] = _hot_cold(z_array[y, x]) * _confidence(z_array[y, x])
349+
return out
350+
351+
352+
def hotspots(raster, kernel):
353+
"""Identify statistically significant hot spots and cold spots in an input
354+
raster. To be a statistically significant hot spot, a feature will have a
355+
high value and be surrounded by other features with high values as well.
356+
Neighborhood of a feature defined by the input kernel, which currently
357+
support a shape of circle and a radius in meters.
358+
359+
The result should be a raster with the following 7 values:
360+
90 for 90% confidence high value cluster
361+
95 for 95% confidence high value cluster
362+
99 for 99% confidence high value cluster
363+
-90 for 90% confidence low value cluster
364+
-95 for 95% confidence low value cluster
365+
-99 for 99% confidence low value cluster
366+
0 for no significance
367+
368+
Parameters
369+
----------
370+
raster: xarray.DataArray
371+
Input raster image with shape=(height, width)
372+
kernel: Kernel
373+
374+
Returns
375+
-------
376+
hotspots: xarray.DataArray
377+
"""
378+
379+
# validate raster
380+
if not isinstance(raster, DataArray):
381+
raise TypeError("`raster` must be instance of DataArray")
382+
383+
if raster.ndim != 2:
384+
raise ValueError("`raster` must be 2D")
385+
386+
if not (issubclass(raster.values.dtype.type, np.integer) or
387+
issubclass(raster.values.dtype.type, np.float)):
388+
raise ValueError(
389+
"`raster` must be an array of integers or float")
390+
391+
# create kernel mask array
392+
kernel_values = kernel.to_array(raster)
393+
# apply kernel to raster values
394+
mean_array = _apply(raster.values.astype(float), kernel_values, calc_mean)
395+
396+
# calculate z-scores
397+
global_mean = np.nanmean(raster.values)
398+
global_std = np.nanstd(raster.values)
399+
if global_std == 0:
400+
raise ZeroDivisionError("Standard deviation "
401+
"of the input raster values is 0.")
402+
z_array = (mean_array - global_mean) / global_std
403+
404+
out = _hotspots(z_array)
405+
result = DataArray(out,
406+
coords=raster.coords,
407+
dims=raster.dims,
408+
attrs=raster.attrs)
409+
410+
return result

0 commit comments

Comments
 (0)