diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000..10b4dd1
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,24 @@
+name: Check code standards and run tests
+
+on: [push, pull_request]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.8'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -r requirements.txt -r requirements-dev.txt
+    - name: Run pre-commit
+      uses: pre-commit/action@v2.0.0
+    - name: Test with pytest and coverage
+      run: |
+        pytest -v --cov=sgkit_plink --cov-report term-missing
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..6e3fc84
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,27 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v2.5.0
+    hooks:
+      - id: check-merge-conflict
+      - id: debug-statements
+      - id: mixed-line-ending
+      - id: check-case-conflict
+      - id: check-yaml
+  - repo: https://github.com/asottile/seed-isort-config
+    rev: v2.2.0
+    hooks:
+      - id: seed-isort-config
+  - repo: https://github.com/timothycrosley/isort
+    rev: 4.3.21-2 # pick the isort version you'd like to use from https://github.com/timothycrosley/isort/releases
+    hooks:
+      - id: isort
+  - repo: https://github.com/python/black
+    rev: 19.10b0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://gitlab.com/pycqa/flake8
+    rev: 3.7.9
+    hooks:
+      - id: flake8
+        language_version: python3
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..7d7c36c
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,8 @@
+black
+flake8
+isort
+mypy
+pre-commit
+pytest
+pytest-cov
+pytest-datadir
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c2e0d4c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+xarray
+pysnptools>=0.4.19
+-e git://github.com/pystatgen/sgkit.git#egg=sgkit
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..9a875f7
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,31 @@
+[coverage:report]
+fail_under = 100
+
+[flake8]
+ignore =
+    # whitespace before ':' - doesn't work well with black
+    E203
+    # module level import not at top of file
+    E402
+    # line too long - let black worry about that
+    E501
+    # do not assign a lambda expression, use a def
+    E731
+    # line break before binary operator
+    W503
+
+[isort]
+default_section = THIRDPARTY
+known_first_party = sgkit
+known_third_party = dask,numpy,pysnptools,pytest,xarray
+multi_line_output = 3
+include_trailing_comma = True
+force_grid_wrap = 0
+use_parentheses = True
+line_length = 88
+
+[mypy-numpy.*]
+ignore_missing_imports = True
+
+[mypy-sgkit_plink.tests.*]
+disallow_untyped_defs = False
diff --git a/sgkit_plink/__init__.py b/sgkit_plink/__init__.py
new file mode 100644
index 0000000..32e558e
--- /dev/null
+++ b/sgkit_plink/__init__.py
@@ -0,0 +1,3 @@
+from .pysnptools import read_plink  # noqa: F401
+
+__all__ = ["read_plink"]
diff --git a/sgkit_plink/pysnptools.py b/sgkit_plink/pysnptools.py
new file mode 100644
index 0000000..08bd079
--- /dev/null
+++ b/sgkit_plink/pysnptools.py
@@ -0,0 +1,295 @@
+"""PLINK 1.9 reader implementation"""
+from pathlib import Path
+from typing import Union
+
+import dask.array as da
+import dask.dataframe as dd
+import numpy as np
+from dask.dataframe import DataFrame
+from pysnptools.snpreader import Bed
+from xarray import Dataset
+
+from sgkit import create_genotype_call_dataset
+from sgkit.api import DIM_SAMPLE
+
+PathType = Union[str, Path]
+
+FAM_FIELDS = [
+    ("family_id", str, "U"),
+    ("member_id", str, "U"),
+    ("paternal_id", str, "U"),
+    ("maternal_id", str, "U"),
+    ("sex", str, "int8"),
+    ("phenotype", str, "int8"),
+]
+FAM_DF_DTYPE = dict([(f[0], f[1]) for f in FAM_FIELDS])
+FAM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in FAM_FIELDS])
+
+BIM_FIELDS = [
+    ("contig", str, "U"),
+    ("variant_id", str, "U"),
+    ("cm_pos", "float32", "float32"),
+    ("pos", "int32", "int32"),
+    ("a1", str, "U"),
+    ("a2", str, "U"),
+]
+BIM_DF_DTYPE = dict([(f[0], f[1]) for f in BIM_FIELDS])
+BIM_ARRAY_DTYPE = dict([(f[0], f[2]) for f in BIM_FIELDS])
+
+
+class BedReader(object):
+    def __init__(self, path, shape, dtype=np.int8, count_A1=True):
+        # n variants (sid = SNP id), n samples (iid = Individual id)
+        n_sid, n_iid = shape
+        # Initialize Bed with empty arrays for axis data, otherwise it will
+        # load the bim/map/fam files entirely into memory (it does not do out-of-core for those)
+        self.bed = Bed(
+            str(path),
+            count_A1=count_A1,
+            # Array (n_sample, 2) w/ FID and IID
+            iid=np.empty((n_iid, 2), dtype="str"),
+            # SNP id array (n_variants)
+            sid=np.empty((n_sid,), dtype="str"),
+            # Contig and positions array (n_variants, 3)
+            pos=np.empty((n_sid, 3), dtype="int"),
+        )
+        self.shape = (n_sid, n_iid, 2)
+        self.dtype = dtype
+        self.ndim = 3
+
+    def __getitem__(self, idx):
+        if not isinstance(idx, tuple):
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be tuple (received {type(idx)})"
+            )
+        if len(idx) != self.ndim:
+            raise IndexError(  # pragma: no cover
+                f"Indexer must be {self.ndim}-item tuple (received {len(idx)} slices)"
+            )
+        # Slice using reversal of first two slices --
+        # pysnptools uses sample x variant orientation
+        arr = self.bed[idx[1::-1]].read(dtype=np.float32, view_ok=False).val.T
+        # Convert missing calls (NaN) to -1
+        arr = np.nan_to_num(arr, nan=-1.0)
+        arr = arr.astype(self.dtype)
+        # Add a ploidy dimension, so allele counts of 0, 1, 2 correspond to 00, 10, 11
+        arr = np.stack(
+            [
+                np.where(arr < 0, -1, np.where(arr == 0, 0, 1)),
+                np.where(arr < 0, -1, np.where(arr == 2, 1, 0)),
+            ],
+            axis=-1,
+        )
+
+        # Apply final slice to 3D result
+        return arr[:, :, idx[-1]]
+
+    def close(self):
+        # This is not actually crucial since a Bed instance with no
+        # in-memory bim/map/fam data is essentially just a file pointer,
+        # but this will still be problematic if an array is created
+        # from the same PLINK dataset many times
+        self.bed._close_bed()  # pragma: no cover
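+
+
+# Illustration (not executed) of the call encoding produced by
+# BedReader.__getitem__: a 2 x 2 (variant x sample) block with raw PLINK
+# allele counts [[0, 1], [2, nan]] becomes
+# [[[0, 0], [1, 0]], [[1, 1], [-1, -1]]], i.e. counts 0/1/2 map to the
+# diploid calls 00/10/11 and missing values map to -1/-1.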
+
+
+def _to_dict(df, dtype=None):
+    return {
+        c: df[c].to_dask_array(lengths=True).astype(dtype[c] if dtype else df[c].dtype)
+        for c in df
+    }
+
+
+def read_fam(path: PathType, sep: str = " ") -> DataFrame:
+    # See: https://www.cog-genomics.org/plink/1.9/formats#fam
+    names = [f[0] for f in FAM_FIELDS]
+    df = dd.read_csv(str(path) + ".fam", sep=sep, names=names, dtype=FAM_DF_DTYPE)
+
+    def coerce_code(v, codes):
+        # Set non-ints and unexpected codes to missing (-1)
+        v = dd.to_numeric(v, errors="coerce")
+        v = v.where(v.isin(codes), np.nan)
+        return v.fillna(-1).astype("int8")
+
+    df["paternal_id"] = df["paternal_id"].where(df["paternal_id"] != "0", None)
+    df["maternal_id"] = df["maternal_id"].where(df["maternal_id"] != "0", None)
+    df["sex"] = coerce_code(df["sex"], [1, 2])
+    df["phenotype"] = coerce_code(df["phenotype"], [1, 2])
+
+    return df
+
+
+def read_bim(path: PathType, sep: str = "\t") -> DataFrame:
+    # See: https://www.cog-genomics.org/plink/1.9/formats#bim
+    names = [f[0] for f in BIM_FIELDS]
+    df = dd.read_csv(str(path) + ".bim", sep=sep, names=names, dtype=BIM_DF_DTYPE)
+    df["contig"] = df["contig"].where(df["contig"] != "0", None)
+    return df
+
+
+def encode_array(x):
+    """Encode array values as integers indexing unique values
+
+    The codes created for each unique element in the array correspond
+    to order of appearance, not the natural sort order for the array
+    dtype.
+
+    Examples
+    --------
+
+    >>> encode_array(['c', 'a', 'a', 'b'])
+    (array([0, 1, 1, 2]), array(['c', 'a', 'b'], dtype='<U1'))
+    """
+    # np.unique returns `names` in sorted order, so remap the codes in
+    # `inverse` to rank values by order of first appearance instead
+    names, index, inverse = np.unique(x, return_index=True, return_inverse=True)
+    sorter = np.argsort(index)
+    rank = np.empty_like(sorter)
+    rank[sorter] = np.arange(len(sorter))
+    return rank[inverse], names[sorter]
+
+
+def read_plink(
+    path: PathType,
+    chunks: Union[str, int, tuple] = "auto",
+    fam_sep: str = " ",
+    bim_sep: str = "\t",
+    bim_int_contig: bool = False,
+    count_a1: bool = True,
+    lock: bool = False,
+    persist: bool = True,
+) -> Dataset:
+    """Read PLINK dataset
+
+    Loads a single PLINK dataset as dask arrays within a Dataset
+    from bed, bim, and fam files.
+
+    Parameters
+    ----------
+    path : PathType
+        Path to PLINK file set.
+        This should not include a suffix, i.e. if the files are
+        at `data.{bed,fam,bim}` then only 'data' should be
+        provided (suffixes are added internally).
+    chunks : Union[str, int, tuple], optional
+        Chunk size for genotype (i.e. `.bed`) data, by default "auto"
+    fam_sep : str, optional
+        Delimiter for `.fam` file, by default " "
+    bim_sep : str, optional
+        Delimiter for `.bim` file, by default "\t"
+    bim_int_contig : bool, optional
+        Whether or not the contig/chromosome name in the `.bim`
+        file should be interpreted as an integer, by default False.
+        If False, then the `variant/contig` field in the resulting
+        dataset will contain the indexes of corresponding strings
+        encountered in the first `.bim` field.
+    count_a1 : bool, optional
+        Whether to count A1 alleles (True) or A2 alleles (False),
+        by default True. Typically A1 is the minor allele
+        and should be counted instead of A2. This is not enforced
+        by PLINK though and it is up to the data generating process
+        to ensure that A1 is in fact an alternate/minor/effect
+        allele. See https://www.cog-genomics.org/plink/1.9/formats
+        for more details.
+    lock : bool, optional
+        Whether or not to synchronize concurrent reads of `.bed`
+        file blocks, by default False. This is passed through to
+        [dask.array.from_array](https://docs.dask.org/en/latest/array-api.html#dask.array.from_array).
+    persist : bool, optional
+        Whether or not to persist `.fam` and `.bim` information in
+        memory, by default True. This is an important performance
+        consideration as the plain text files for this data will
+        be read multiple times when False. This can lead to load
+        times that are upwards of 10x slower.
+
+    Returns
+    -------
+    Dataset
+        A dataset containing genotypes as 3 dimensional calls along with
+        all accompanying pedigree and variant information. The content
+        of this dataset matches that of `sgkit.create_genotype_call_dataset`
+        with all pedigree-specific fields defined as:
+        - sample/family_id: Family identifier commonly referred to as FID
+        - sample/id: Within-family identifier for sample
+        - sample/paternal_id: Within-family identifier for father of sample
+        - sample/maternal_id: Within-family identifier for mother of sample
+        - sample/sex: Sex code equal to 1 for male, 2 for female, and -1
+          for missing
+        - sample/phenotype: Phenotype code equal to 1 for control, 2 for case,
+          and -1 for missing
+
+    See https://www.cog-genomics.org/plink/1.9/formats#fam for more details.
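+
+    Examples
+    --------
+    A minimal sketch using the simulated fileset bundled with the test suite
+    (any common prefix of a `.bed`/`.bim`/`.fam` trio works the same way);
+    the resulting call array has dimensions (variants, samples, ploidy):
+
+    >>> ds = read_plink(  # doctest: +SKIP
+    ...     "sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss",
+    ...     bim_sep="\t",
+    ...     fam_sep="\t",
+    ... )
+    >>> ds["call/genotype"].shape  # doctest: +SKIP
+    (100, 10, 2)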
+    """
+
+    # Load axis data first to determine dimension sizes
+    df_fam = read_fam(path, sep=fam_sep)
+    df_bim = read_bim(path, sep=bim_sep)
+
+    if persist:
+        df_fam = df_fam.persist()
+        df_bim = df_bim.persist()
+
+    arr_fam = _to_dict(df_fam, dtype=FAM_ARRAY_DTYPE)
+    arr_bim = _to_dict(df_bim, dtype=BIM_ARRAY_DTYPE)
+
+    # Load genotyping data
+    call_genotype = da.from_array(
+        # Make sure to use asarray=False in order for masked arrays to propagate
+        BedReader(path, (len(df_bim), len(df_fam)), count_A1=count_a1),
+        chunks=chunks,
+        # Lock must be True with the multiprocessing dask scheduler
+        # to avoid pysnptools errors (the threading scheduler works without it)
+        lock=lock,
+        asarray=False,
+        name=f"pysnptools:read_plink:{path}",
+    )
+
+    # If contigs are already integers, use them as-is
+    if bim_int_contig:
+        variant_contig = arr_bim["contig"].astype("int16")
+        variant_contig_names = da.unique(variant_contig).astype(str)
+        variant_contig_names = list(variant_contig_names.compute())
+    # Otherwise create index for contig names based
+    # on order of appearance in underlying .bim file
+    else:
+        variant_contig, variant_contig_names = encode_array(arr_bim["contig"].compute())
+        variant_contig = variant_contig.astype("int16")
+        variant_contig_names = list(variant_contig_names)
+
+    variant_position = arr_bim["pos"]
+    a1 = arr_bim["a1"].astype("str")
+    a2 = arr_bim["a2"].astype("str")
+
+    # Note: column_stack not implemented in Dask, must use [v|h]stack
+    variant_alleles = da.hstack((a1[:, np.newaxis], a2[:, np.newaxis]))
+    # TODO: Why use bytes for this instead of string?
+    variant_alleles = variant_alleles.astype("S")
+    variant_id = arr_bim["variant_id"]
+
+    sample_id = arr_fam["member_id"]
+
+    ds = create_genotype_call_dataset(
+        variant_contig_names=variant_contig_names,
+        variant_contig=variant_contig,
+        variant_position=variant_position,
+        variant_alleles=variant_alleles,
+        sample_id=sample_id,
+        call_genotype=call_genotype,
+        variant_id=variant_id,
+    )
+
+    # Assign PLINK-specific pedigree fields
+    ds = ds.assign(
+        **{f"sample/{f}": (DIM_SAMPLE, arr_fam[f]) for f in arr_fam if f != "member_id"}
+    )
+    return ds
diff --git a/sgkit_plink/tests/__init__.py b/sgkit_plink/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed
new file mode 100644
index 0000000..7cd4487
Binary files /dev/null and b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bed differ
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
new file mode 100644
index 0000000..01325a9
--- /dev/null
+++ b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.bim
@@ -0,0 +1,100 @@
+1	1:1:A:C	0.0	1	C	A
+1	1:2:A:C	0.0	2	C	A
+1	1:3:A:C	0.0	3	C	A
+1	1:4:A:C	0.0	4	C	A
+1	1:5:A:C	0.0	5	C	A
+1	1:6:A:C	0.0	6	C	A
+1	1:7:A:C	0.0	7	C	A
+1	1:8:A:C	0.0	8	C	A
+1	1:9:A:C	0.0	9	C	A
+1	1:10:A:C	0.0	10	C	A
+1	1:11:A:C	0.0	11	C	A
+1	1:12:A:C	0.0	12	C	A
+1	1:13:A:C	0.0	13	C	A
+1	1:14:A:C	0.0	14	C	A
+1	1:15:A:C	0.0	15	C	A
+1	1:16:A:C	0.0	16	C	A
+1	1:17:A:C	0.0	17	C	A
+1	1:18:A:C	0.0	18	C	A
+1	1:19:A:C	0.0	19	C	A
+1	1:20:A:C	0.0	20	C	A
+1	1:21:A:C	0.0	21	C	A
+1	1:22:A:C	0.0	22	C	A
+1	1:23:A:C	0.0	23	C	A
+1	1:24:A:C	0.0	24	C	A
+1	1:25:A:C	0.0	25	C	A
+1	1:26:A:C	0.0	26	C	A
+1	1:27:A:C	0.0	27	C	A
+1	1:28:A:C	0.0	28	C	A
+1	1:29:A:C	0.0	29	C	A
+1	1:30:A:C	0.0	30	C	A
+1	1:31:A:C	0.0	31	C	A
+1	1:32:A:C	0.0	32	C	A
+1	1:33:A:C	0.0	33	C	A
+1	1:34:A:C	0.0	34	C	A
+1	1:35:A:C	0.0	35	C	A
+1	1:36:A:C	0.0	36	C	A
+1	1:37:A:C	0.0	37	C	A
+1	1:38:A:C	0.0	38	C	A
+1	1:39:A:C	0.0	39	C	A
+1	1:40:A:C	0.0	40	C	A
+1	1:41:A:C	0.0	41	C	A
+1	1:42:A:C	0.0	42	C	A
+1	1:43:A:C	0.0	43	C	A
+1	1:44:A:C	0.0	44	C	A
+1	1:45:A:C	0.0	45	C	A
+1	1:46:A:C	0.0	46	C	A
+1	1:47:A:C	0.0	47	C	A
+1	1:48:A:C	0.0	48	C	A
+1	1:49:A:C	0.0	49	C	A
+1	1:50:A:C	0.0	50	C	A
+1	1:51:A:C	0.0	51	C	A
+1	1:52:A:C	0.0	52	C	A
+1	1:53:A:C	0.0	53	C	A
+1	1:54:A:C	0.0	54	C	A
+1	1:55:A:C	0.0	55	C	A
+1	1:56:A:C	0.0	56	C	A
+1	1:57:A:C	0.0	57	C	A
+1	1:58:A:C	0.0	58	C	A
+1	1:59:A:C	0.0	59	C	A
+1	1:60:A:C	0.0	60	C	A
+1	1:61:A:C	0.0	61	C	A
+1	1:62:A:C	0.0	62	C	A
+1	1:63:A:C	0.0	63	C	A
+1	1:64:A:C	0.0	64	C	A
+1	1:65:A:C	0.0	65	C	A
+1	1:66:A:C	0.0	66	C	A
+1	1:67:A:C	0.0	67	C	A
+1	1:68:A:C	0.0	68	C	A
+1	1:69:A:C	0.0	69	C	A
+1	1:70:A:C	0.0	70	C	A
+1	1:71:A:C	0.0	71	C	A
+1	1:72:A:C	0.0	72	C	A
+1	1:73:A:C	0.0	73	C	A
+1	1:74:A:C	0.0	74	C	A
+1	1:75:A:C	0.0	75	C	A
+1	1:76:A:C	0.0	76	C	A
+1	1:77:A:C	0.0	77	C	A
+1	1:78:A:C	0.0	78	C	A
+1	1:79:A:C	0.0	79	C	A
+1	1:80:A:C	0.0	80	C	A
+1	1:81:A:C	0.0	81	C	A
+1	1:82:A:C	0.0	82	C	A
+1	1:83:A:C	0.0	83	C	A
+1	1:84:A:C	0.0	84	C	A
+1	1:85:A:C	0.0	85	C	A
+1	1:86:A:C	0.0	86	C	A
+1	1:87:A:C	0.0	87	C	A
+1	1:88:A:C	0.0	88	C	A
+1	1:89:A:C	0.0	89	C	A
+1	1:90:A:C	0.0	90	C	A
+1	1:91:A:C	0.0	91	C	A
+1	1:92:A:C	0.0	92	C	A
+1	1:93:A:C	0.0	93	C	A
+1	1:94:A:C	0.0	94	C	A
+1	1:95:A:C	0.0	95	C	A
+1	1:96:A:C	0.0	96	C	A
+1	1:97:A:C	0.0	97	C	A
+1	1:98:A:C	0.0	98	C	A
+1	1:99:A:C	0.0	99	C	A
+1	1:100:A:C	0.0	100	C	A
diff --git a/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
new file mode 100644
index 0000000..e0af7f4
--- /dev/null
+++ b/sgkit_plink/tests/data/plink_sim_10s_100v_10pmiss.fam
@@ -0,0 +1,10 @@
+0	0	0	0	0	NA
+0	1	0	0	0	NA
+0	2	0	0	0	NA
+0	3	0	0	0	NA
+0	4	0	0	0	NA
+0	5	0	0	0	NA
+0	6	0	0	0	NA
+0	7	0	0	0	NA
+0	8	0	0	0	NA
+0	9	0	0	0	NA
diff --git a/sgkit_plink/tests/test_pysnptools.py b/sgkit_plink/tests/test_pysnptools.py
new file mode 100644
index 0000000..7609587
--- /dev/null
+++ b/sgkit_plink/tests/test_pysnptools.py
@@ -0,0 +1,112 @@
+import numpy as np
+import pytest
+from sgkit_plink import pysnptools
+
+example_dataset_1 = "plink_sim_10s_100v_10pmiss"
+
+
+@pytest.fixture(params=[dict()])
+def ds1(shared_datadir, request):
+    path = shared_datadir / example_dataset_1
+    return pysnptools.read_plink(path, bim_sep="\t", fam_sep="\t", **request.param)
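+
+
+# Note: the `ds1` fixture is parametrized indirectly, so individual tests
+# can opt into non-default reader arguments with, e.g.:
+#   @pytest.mark.parametrize("ds1", [dict(bim_int_contig=True)], indirect=True)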
+
+
+def test_read_slicing(ds1):
+    gt = ds1["call/genotype"]
+    shape = gt.shape
+    assert gt[:3].shape == (3,) + shape[1:]
+    assert gt[:, :3].shape == shape[:1] + (3,) + shape[2:]
+    assert gt[:3, :5].shape == (3, 5) + shape[2:]
+    assert gt[:3, :5, :1].shape == (3, 5, 1)
+
+
+def test_encode_array():
+    def check(x, values, names):
+        v, n = pysnptools.encode_array(x)
+        np.testing.assert_equal(v, values)
+        np.testing.assert_equal(n, names)
+
+    check([], [], [])
+    check(["a"], [0], ["a"])
+    check(["a", "b"], [0, 1], ["a", "b"])
+    check(["b", "a"], [0, 1], ["b", "a"])
+    check(["a", "b", "b"], [0, 1, 1], ["a", "b"])
+    check(["b", "b", "a"], [0, 0, 1], ["b", "a"])
+    check(["b", "b", "a", "a"], [0, 0, 1, 1], ["b", "a"])
+    check(["c", "a", "a", "b"], [0, 1, 1, 2], ["c", "a", "b"])
+    check(["b", "b", "c", "c", "c", "a", "a"], [0, 0, 1, 1, 1, 2, 2], ["b", "c", "a"])
+    check(["b", "c", "b", "c", "a"], [0, 1, 0, 1, 2], ["b", "c", "a"])
+    check([2, 2, 1, 3, 1, 5, 5, 1], [0, 0, 1, 2, 1, 3, 3, 1], [2.0, 1.0, 3.0, 5.0])
+    check(
+        [2.0, 2.0, 1.0, 3.0, 1.0, 5.0, 5.0, 1.0],
+        [0, 0, 1, 2, 1, 3, 3, 1],
+        [2.0, 1.0, 3.0, 5.0],
+    )
+
+
+@pytest.mark.parametrize("ds1", [dict(bim_int_contig=True)], indirect=True)
+def test_read_int_contig(ds1):
+    # Test contig parsed as int (the value is always "1" in the .bim for ds1)
+    assert np.all(ds1["variant/contig"].values == 1)
+    assert ds1.attrs["contigs"] == ["1"]
+
+
+@pytest.mark.parametrize("ds1", [dict(bim_int_contig=False)], indirect=True)
+def test_read_str_contig(ds1):
+    # Test contig indexed as string (the value is always "1" in the .bim for ds1)
+    assert np.all(ds1["variant/contig"].values == 0)
+    assert ds1.attrs["contigs"] == ["1"]
+
+
+def test_read_call_values(ds1):
+    # Validate a few randomly selected individual calls
+    # (spanning all possible states for a call)
+    idx = np.array(
+        [
+            [50, 7],
+            [81, 8],
+            [45, 2],
+            [36, 8],
+            [24, 2],
+            [92, 9],
+            [26, 2],
+            [81, 0],
+            [31, 8],
+            [4, 9],
+        ]
+    )
+    expected = np.array(
+        [
+            [1, 0],
+            [1, 0],
+            [1, 1],
+            [1, 1],
+            [-1, -1],
+            [0, 0],
+            [0, 0],
+            [1, 1],
+            [0, 0],
+            [0, 0],
+        ]
+    )
+    gt = ds1["call/genotype"].values
+    actual = gt[tuple(idx.T)]
+    np.testing.assert_equal(actual, expected)
+
+
+def test_read_stat_call_rate(ds1):
+    # Validate call rate for each sample
+    sample_call_rates = (
+        (ds1["call/genotype"] >= 0).max(dim="ploidy").mean(dim="variants").values
+    )
+    np.testing.assert_equal(
+        sample_call_rates, [0.95, 0.9, 0.91, 0.87, 0.86, 0.83, 0.86, 0.87, 0.92, 0.92]
+    )
+
+
+def test_read_stat_alt_alleles(ds1):
+    # Validate alt allele sum for each sample
+    n_alt_alleles = (
+        ds1["call/genotype"].clip(0, 2).sum(dim="ploidy").sum(dim="variants").values
+    )
+    np.testing.assert_equal(n_alt_alleles, [102, 95, 98, 94, 88, 91, 90, 98, 96, 103])