Skip to content

Commit cb46cec

Browse files
authored
Breakout user-guide, add num_sample param option to natural_breaks(), fix slow import (#123)
* break out user-guide * add reference * natural breaks: add num_sample param option * user guide: add classification notebook * remove scikit-learn dep from setup * viewshed: use lazy-compilation to fix slow import * focal: raise error if non positive distance * focal: functional programming, condense tests
1 parent bcf39ad commit cb46cec

13 files changed

+2677
-2009
lines changed
+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Xarray-spatial\n",
8+
"### User Guide: Getting setup"
9+
]
10+
},
11+
{
12+
"cell_type": "markdown",
13+
"metadata": {},
14+
"source": [
15+
"### Installation\n",
16+
"\n",
17+
"The package can be installed easily via conda or pip. \n",
18+
"\n",
19+
"#### To install this package with conda run:\n",
20+
"- conda install -c conda-forge xarray-spatial\n",
21+
"- conda install -c makepath xarray-spatial\n",
22+
"\n",
23+
"#### To install with pip run:\n",
24+
"- pip install xarray-spatial\n",
25+
"\n",
26+
"#### To verify whether the installation succeeded, open a Python session and import the package:\n",
27+
"- import xrspatial"
28+
]
29+
}
30+
],
31+
"metadata": {
32+
"kernelspec": {
33+
"display_name": "Python 3",
34+
"language": "python",
35+
"name": "python3"
36+
},
37+
"language_info": {
38+
"codemirror_mode": {
39+
"name": "ipython",
40+
"version": 3
41+
},
42+
"file_extension": ".py",
43+
"mimetype": "text/x-python",
44+
"name": "python",
45+
"nbconvert_exporter": "python",
46+
"pygments_lexer": "ipython3",
47+
"version": "3.6.10"
48+
},
49+
"nbTranslate": {
50+
"displayLangs": [
51+
"*"
52+
],
53+
"hotkey": "alt-t",
54+
"langInMainMenu": true,
55+
"sourceLang": "en",
56+
"targetLang": "fr",
57+
"useGoogleTranslate": true
58+
},
59+
"toc": {
60+
"base_numbering": 1,
61+
"nav_menu": {},
62+
"number_sections": true,
63+
"sideBar": true,
64+
"skip_h1_title": false,
65+
"title_cell": "Table of Contents",
66+
"title_sidebar": "Contents",
67+
"toc_cell": false,
68+
"toc_position": {},
69+
"toc_section_display": true,
70+
"toc_window_display": false
71+
}
72+
},
73+
"nbformat": 4,
74+
"nbformat_minor": 2
75+
}

examples/user-guide.ipynb renamed to examples/user_guide/1_Surface.ipynb

+167-1,646
Large diffs are not rendered by default.

examples/user_guide/2_Proximity.ipynb

+557
Large diffs are not rendered by default.

examples/user_guide/3_Zonal.ipynb

+488
Large diffs are not rendered by default.

examples/user_guide/5_Classification.ipynb

+396
Large diffs are not rendered by default.

examples/user_guide/8_Remote_Sensing.ipynb

+343
Large diffs are not rendered by default.

examples/user_guide/9_Pathfinding.ipynb

+438
Large diffs are not rendered by default.

setup.py

-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@
1818
'pandas',
1919
'pillow',
2020
'requests',
21-
'scikit-learn',
2221
'scipy',
2322
'xarray',
2423
],

xrspatial/classify.py

+34-10
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
from datashader.colors import rgb
55
from datashader.utils import ngjit
66
from xarray import DataArray
7+
8+
from numpy.random import RandomState
9+
710
import warnings
811
warnings.simplefilter('default')
912

@@ -238,12 +241,7 @@ def _jenks(data, n_classes):
238241
return kclass
239242

240243

241-
def _kmeans(agg, k=5):
242-
centroids = _jenks(agg.data.flatten(), k)
243-
return centroids[1:]
244-
245-
246-
def natural_breaks(agg, name='natural_breaks', k=5):
244+
def natural_breaks(agg, num_sample=None, name='natural_breaks', k=5):
247245
"""
248246
Calculate Jenks natural breaks (a.k.a kmeans in one dimension)
249247
for an input raster xarray.
@@ -252,9 +250,14 @@ def natural_breaks(agg, name='natural_breaks', k=5):
252250
----------
253251
agg : xarray.DataArray
254252
xarray.DataArray of values to bin
253+
num_sample: int (optional)
254+
Number of sample data points used to fit the model.
255+
Natural Breaks (Jenks) classification has O(n²) complexity,
256+
where n is the total number of data points, i.e. agg.size.
257+
When n is large, we should fit the model on a small sub-sample
258+
of the data instead of using the whole dataset.
255259
k: int
256260
Number of classes
257-
258261
Returns
259262
-------
260263
natural_breaks_agg: xarray.DataArray
@@ -283,7 +286,28 @@ def natural_breaks(agg, name='natural_breaks', k=5):
283286
[4., 4., 4.]]
284287
"""
285288

286-
uv = np.unique(agg.data)
289+
num_data = agg.size
290+
291+
if num_sample is not None and num_sample < num_data:
292+
# randomly select sample from the whole dataset
293+
# create a pseudo random number generator
294+
generator = RandomState(1234567890)
295+
idx = [i for i in range(0, agg.size)]
296+
generator.shuffle(idx)
297+
sample_idx = idx[:num_sample]
298+
sample_data = agg.data.flatten()[sample_idx]
299+
else:
300+
sample_data = agg.data.flatten()
301+
302+
# warn if the number of data points used to fit the model is 40,000 or more
303+
if sample_data.size >= 40000:
304+
warnings.warn('natural_breaks Warning: Natural break classification '
305+
'(Jenks) has a complexity of O(n^2), '
306+
'your classification with {} data points may take '
307+
'a long time.'.format(sample_data.size),
308+
Warning)
309+
310+
uv = np.unique(sample_data)
287311
uvk = len(uv)
288312

289313
if uvk < k:
@@ -295,8 +319,8 @@ def natural_breaks(agg, name='natural_breaks', k=5):
295319
uv.sort()
296320
bins = uv
297321
else:
298-
res0 = _kmeans(agg, k)
299-
bins = np.array(res0)
322+
centroids = _jenks(sample_data, k)
323+
bins = np.array(centroids[1:])
300324

301325
return DataArray(_bin(agg.data, bins, np.arange(uvk)),
302326
name=name,

0 commit comments

Comments
 (0)