Skip to content

Commit 02fc4e6

Browse files
tonioshikanlugguuss
authored and committed
Add bulk loading Python Sample [(#2295)](GoogleCloudPlatform/python-docs-samples#2295)
Adds the following functionality: * Create bulk_load_csv * Delete bulk_load_csv * Create schema.ddl
1 parent cd34847 commit 02fc4e6

File tree

7 files changed

+17450
-0
lines changed

7 files changed

+17450
-0
lines changed
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
Google Cloud Spanner: Bulk Loading From CSV Python Sample
2+
===============
3+
4+
``Google Cloud Spanner`` is a highly scalable, transactional, managed, NewSQL database service.
5+
Cloud Spanner solves the need for a horizontally-scaling database with consistent global transactions and SQL semantics.
6+
7+
This application demonstrates how to load data from a csv file into a Cloud
8+
Spanner database.
9+
10+
The data contained in the csv files is sourced from the "Hacker News - Y Combinator" BigQuery `public dataset`_.
11+
12+
.. _public dataset :
13+
https://cloud.google.com/bigquery/public-data/
14+
15+
Prerequisite
16+
-----------------------
17+
Create a database in your Cloud Spanner instance using the `schema`_ in the folder.
18+
19+
.. _schema:
20+
schema.ddl
21+
22+
Setup
23+
------------------------
24+
25+
Authentication
26+
++++++++++++++
27+
28+
This sample requires you to have authentication setup. Refer to the
29+
`Authentication Getting Started Guide`_ for instructions on setting up
30+
credentials for applications.
31+
32+
.. _Authentication Getting Started Guide:
33+
https://cloud.google.com/docs/authentication/getting-started
34+
35+
Install Dependencies
36+
++++++++++++++++++++
37+
38+
#. Install `pip`_ and `virtualenv`_ if you do not already have them. You may want to refer to the `Python Development Environment Setup Guide`_ for Google Cloud Platform for instructions.
39+
40+
.. _Python Development Environment Setup Guide:
41+
https://cloud.google.com/python/setup
42+
43+
#. Create a virtualenv. Samples are compatible with Python 2.7 and 3.4+.
44+
45+
MACOS/LINUX
46+
47+
.. code-block:: bash
48+
49+
$ virtualenv env
50+
$ source env/bin/activate
51+
52+
WINDOWS
53+
54+
.. code-block:: bash
55+
56+
> virtualenv env
57+
> .\env\Scripts\activate
58+
59+
#. Install the dependencies needed to run the samples.
60+
61+
.. code-block:: bash
62+
63+
$ pip install -r requirements.txt
64+
65+
.. _pip: https://pip.pypa.io/
66+
.. _virtualenv: https://virtualenv.pypa.io/
67+
68+
69+
To run sample
70+
-----------------------
71+
72+
$ python batch_import.py instance_id database_id
73+
74+
positional arguments:
75+
instance_id: Your Cloud Spanner instance ID.
76+
77+
database_id : Your Cloud Spanner database ID.
Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# Copyright 2019 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This application demonstrates how to do batch operations from a csv file
16+
# using Cloud Spanner.
17+
# For more information, see the README.rst.
18+
19+
20+
import csv
21+
import time
22+
import threading
23+
import argparse
24+
from google.cloud import spanner
25+
26+
27+
def is_bool_null(file):
    """Parse CSV lines into rows, normalizing cell values for Spanner.

    Converts the string ``'true'`` to the boolean ``True`` and the
    empty string to ``None`` (an empty cell, i.e. SQL NULL) so values
    are written to Cloud Spanner with the correct types.

    Args:
        file: An iterable of CSV-formatted lines (e.g. an open file).

    Returns:
        A list of rows (lists) with normalized cell values.
    """
    data = list(csv.reader(file))
    # Visit every cell of every row read from the csv input.
    for line in range(len(data)):
        for cell in range(len(data[line])):
            # Change the string 'true' to a real boolean.
            # Fix: use the literal True instead of eval('True') —
            # eval is unnecessary here and unsafe on untrusted input.
            if data[line][cell] == 'true':
                data[line][cell] = True
            # Change a blank string to None (empty cell / NULL).
            if data[line][cell] == '':
                data[line][cell] = None
    return data
43+
44+
45+
def divide_chunks(lst, n):
    """Yield successive n-sized slices of *lst*.

    Used to split the csv rows so that each batch mutation commits at
    most *n* (e.g. 500) rows at a time.
    """
    start = 0
    while start < len(lst):
        yield lst[start:start + n]
        start += n
50+
51+
52+
def insert_data(database, filepath, table_name, column_names):
    """Load one csv file into a Cloud Spanner table.

    Rows are normalized via is_bool_null, then written in chunks of
    500 using the batch mutation API.
    """
    with open(filepath, newline='') as csv_file:
        rows = tuple(is_bool_null(csv_file))
    # Commit each 500-row chunk as its own batch mutation.
    for chunk in list(divide_chunks(rows, 500)):
        if chunk is not None:
            with database.batch() as batch:
                batch.insert(
                    table=table_name,
                    columns=column_names,
                    values=chunk)
68+
69+
70+
def main(instance_id, database_id):
    """Insert the Hacker News sample data into the given database.

    The database and its 'stories' and 'comments' tables must already
    exist (they can be created using `create_database` / schema.ddl).

    Args:
        instance_id: Cloud Spanner instance ID.
        database_id: Cloud Spanner database ID.
    """
    start = time.time()
    # Connect to the target Cloud Spanner database.
    spanner_client = spanner.Client()
    database = spanner_client.instance(instance_id).database(database_id)
    # One (file path, table name, column names) job per dataset file.
    jobs = (
        ('hnewsstories.txt', 'stories',
         ('id', 'by', 'author', 'dead', 'deleted', 'descendants',
          'score', 'text', 'time', 'time_ts', 'title', 'url')),
        ('hnewscomments.txt', 'comments',
         ('id', 'by', 'author', 'dead', 'deleted', 'parent',
          'ranking', 'text', 'time', 'time_ts')),
    )
    # Load both files concurrently, one thread per file.
    workers = [
        threading.Thread(
            target=insert_data,
            args=(database, path, table, columns))
        for path, table, columns in jobs
    ]
    for worker in workers:
        worker.start()
    # Wait until both loads finish before reporting.
    for worker in workers:
        worker.join()

    print('Finished Inserting Data.')
    end = time.time()
    print('Time: ', end - start)
126+
127+
128+
if __name__ == '__main__':
    # Command-line entry point: two required positional arguments
    # identifying the target Spanner instance and database.
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_parser.add_argument(
        'instance_id', help='Your Cloud Spanner instance ID.')
    arg_parser.add_argument(
        'database_id', help='Your Cloud Spanner database ID.')

    cli_args = arg_parser.parse_args()

    main(cli_args.instance_id, cli_args.database_id)
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Copyright 2019 Google Inc. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# This application demonstrates how to do batch operations from a csv file
16+
# using Cloud Spanner.
17+
# For more information, see the README.rst.
18+
"""Test for batch_import"""
19+
import os
20+
import pytest
21+
import batch_import
22+
from google.cloud import spanner
23+
24+
25+
# Instance is supplied via the environment so tests can target
# different projects/instances without code changes.
INSTANCE_ID = os.environ['SPANNER_INSTANCE']
# Database created (if missing) and dropped by the example_database fixture.
DATABASE_ID = 'hnewsdb'
27+
28+
29+
@pytest.fixture(scope='module')
def spanner_instance():
    """Return a module-scoped handle to the Cloud Spanner test instance."""
    client = spanner.Client()
    return client.instance(INSTANCE_ID)
33+
34+
35+
@pytest.fixture
def example_database():
    """Yield the test database, creating it from schema.ddl if needed.

    The database is dropped once the test using the fixture finishes.
    """
    client = spanner.Client()
    instance = client.instance(INSTANCE_ID)
    database = instance.database(DATABASE_ID)

    if not database.exists():
        # Create the database using the schema from the local DDL file.
        with open('schema.ddl', 'r') as ddl_file:
            ddl = ddl_file.read()
        database = instance.database(DATABASE_ID, ddl_statements=[ddl])
        database.create()

    yield database
    database.drop()
49+
50+
51+
def test_is_bool_null():
    """Each input line becomes one row; 'true' -> True, '' -> empty row."""
    rows = batch_import.is_bool_null(
        ['12', 'true', '', '12', 'jkl', ''])
    assert rows == [['12'], [True], [], ['12'], ['jkl'], []]
56+
57+
58+
def test_divide_chunks():
    """Six items chunked in twos yield exactly three pairs."""
    chunks = batch_import.divide_chunks(
        ['12', 'true', '', '12', 'jkl', ''], 2)
    assert list(chunks) == [['12', 'true'], ['', '12'], ['jkl', '']]
62+
63+
64+
def test_insert_data(capsys):
    """End-to-end load: main() should report success on stdout."""
    batch_import.main(INSTANCE_ID, DATABASE_ID)
    captured, _ = capsys.readouterr()
    assert 'Finished Inserting Data.' in captured

0 commit comments

Comments
 (0)