Skip to content

Commit 0888d50

Browse files
jluethi authored and tcompa committed
Update metadata parsing for #112
1 parent 21ce80d commit 0888d50

File tree

1 file changed

+147
-92
lines changed

1 file changed

+147
-92
lines changed

fractal/tasks/metadata_parsing.py

100644 → 100755
Lines changed: 147 additions & 92 deletions
Original file line number | Diff line number | Diff line change
@@ -10,11 +10,12 @@
1010
Institute for Biomedical Research and Pelkmans Lab from the University of
1111
Zurich.
1212
"""
13-
14-
from xml.etree import ElementTree
15-
import pandas as pd
1613
import warnings
14+
1715
import numpy as np
16+
import pandas as pd
17+
from defusedxml import ElementTree
18+
1819

1920
def parse_yokogawa_metadata(mrf_path, mlf_path):
2021
"""
@@ -28,27 +29,41 @@ def parse_yokogawa_metadata(mrf_path, mlf_path):
2829
mrf_frame, mlf_frame, error_count = read_metadata_files(mrf_path, mlf_path)
2930

3031
per_site_parameters = [
31-
'x_micrometer',
32-
'y_micrometer',
33-
'pixel_size_x',
34-
'pixel_size_y',
35-
'bit_depth'
36-
]
37-
grouping_params = ['well_id', 'field_id']
38-
grouped_sites = mlf_frame.loc[:, grouping_params + per_site_parameters
39-
].groupby(by = grouping_params)
32+
"x_micrometer",
33+
"y_micrometer",
34+
"pixel_size_x",
35+
"pixel_size_y",
36+
"x_pixel",
37+
"y_pixel",
38+
"bit_depth",
39+
]
40+
grouping_params = ["well_id", "field_id"]
41+
grouped_sites = mlf_frame.loc[
42+
:, grouping_params + per_site_parameters
43+
].groupby(by=grouping_params)
4044
check_grouped_sites_consistency(grouped_sites, per_site_parameters)
4145
site_metadata = grouped_sites.mean()
4246

47+
# Cast image pixel sizes & bit depth to int
48+
site_metadata = site_metadata.astype(
49+
{
50+
"x_pixel": "int",
51+
"y_pixel": "int",
52+
"bit_depth": "int",
53+
}
54+
)
55+
4356
# Absolute Z positions are not saved by the Yokogawa,
4457
# only relative positions to the autofocus
45-
site_metadata['z_micrometer'] = 0
58+
site_metadata["z_micrometer"] = 0
4659

4760
site_metadata = pd.concat([site_metadata, get_z_steps(mlf_frame)], axis=1)
4861

4962
if error_count > 0:
50-
print(f"Succesfully parsed {len(site_metadata)} sites, could not " \
51-
f"parse {error_count} sites due to errors (see warnings).")
63+
print(
64+
f"Succesfully parsed {len(site_metadata)} sites, could not "
65+
f"parse {error_count} sites due to errors (see warnings)."
66+
)
5267
total_files = len(mlf_frame)
5368
# TODO: Check whether the total_files correspond to the number of
5469
# relevant input images in the input folder. Returning it for now
@@ -62,9 +77,10 @@ def parse_yokogawa_metadata(mrf_path, mlf_path):
6277

6378

6479
def read_metadata_files(mrf_path, mlf_path):
65-
# parsing of mrf & mlf files are based on the yokogawa_image_collection_task
66-
# v0.5 in drogon, written by Dario Vischi. Now modified for Fractal use
67-
# https://github.com/fmi-basel/job-system-workflows/blob/00bbf34448972d27f258a2c28245dd96180e8229/src/gliberal_workflows/tasks/yokogawa_image_collection_task/versions/version_0_5.py
80+
# parsing of mrf & mlf files are based on the
81+
# yokogawa_image_collection_task v0.5 in drogon, written by Dario Vischi.
82+
# https://github.com/fmi-basel/job-system-workflows/blob/00bbf34448972d27f258a2c28245dd96180e8229/src/gliberal_workflows/tasks/yokogawa_image_collection_task/versions/version_0_5.py # noqa
83+
# Now modified for Fractal use
6884

6985
mrf_frame = read_mrf_file(mrf_path)
7086
# TODO: filter_position & filter_wheel_position are parsed, but not
@@ -78,25 +94,27 @@ def read_metadata_files(mrf_path, mlf_path):
7894
return mrf_frame, mlf_frame, error_count
7995

8096

81-
8297
def read_mrf_file(mrf_path):
8398

8499
# Prepare mrf dataframe
85100
mrf_columns = [
86-
"channel_id", "horiz_pixel_dim", "vert_pixel_dim",
87-
"camera_no", "bit_depth", "horiz_pixels", "vert_pixels",
88-
"filter_wheel_position", "filter_position", "shading_corr_src"
101+
"channel_id",
102+
"horiz_pixel_dim",
103+
"vert_pixel_dim",
104+
"camera_no",
105+
"bit_depth",
106+
"horiz_pixels",
107+
"vert_pixels",
108+
"filter_wheel_position",
109+
"filter_position",
110+
"shading_corr_src",
89111
]
90-
mrf_frame = pd.DataFrame(
91-
columns=mrf_columns
92-
)
112+
mrf_frame = pd.DataFrame(columns=mrf_columns)
93113

94114
mrf_xml = ElementTree.parse(mrf_path).getroot()
95115
# Read mrf file
96116
ns = {"bts": "http://www.yokogawa.co.jp/BTS/BTSSchema/1.0"}
97-
for channel in mrf_xml.findall(
98-
"bts:MeasurementChannel", namespaces=ns
99-
):
117+
for channel in mrf_xml.findall("bts:MeasurementChannel", namespaces=ns):
100118
mrf_frame.loc[channel.get("{%s}Ch" % ns["bts"])] = [
101119
channel.get("{%s}Ch" % ns["bts"]),
102120
float(channel.get("{%s}HorizontalPixelDimension" % ns["bts"])),
@@ -107,7 +125,7 @@ def read_mrf_file(mrf_path):
107125
int(channel.get("{%s}VerticalPixels" % ns["bts"])),
108126
int(channel.get("{%s}FilterWheelPosition" % ns["bts"])),
109127
int(channel.get("{%s}FilterPosition" % ns["bts"])),
110-
channel.get("{%s}ShadingCorrectionSource" % ns["bts"])
128+
channel.get("{%s}ShadingCorrectionSource" % ns["bts"]),
111129
]
112130

113131
return mrf_frame
@@ -126,28 +144,40 @@ def blocks(fh, size=65536):
126144
break
127145
yield block
128146

129-
with open(mlf_path, "r", encoding="utf-8", errors='ignore') as fh:
130-
line_counter = sum(block.count("\n") for block in blocks(fh))
131-
132147
mlf_entries = mlf_xml.findall("bts:MeasurementRecord", namespaces=ns)
133148
nb_lines = len(mlf_entries)
134149

135150
# Prepare mlf dataframe
136151
mlf_columns = [
137-
"type", "well_id", "column", "row", "time_point", "field_id",
138-
"z_index", "timeline_id", "action_id", "action",
139-
"x_micrometer", "y_micrometer", "z_micrometer",
140-
"pixel_size_x", "pixel_size_y", "bit_depth", "width", "height",
141-
"channel_id", 'camera_no', "file_name"
152+
"type",
153+
"well_id",
154+
"column",
155+
"row",
156+
"time_point",
157+
"field_id",
158+
"z_index",
159+
"timeline_id",
160+
"action_id",
161+
"action",
162+
"x_micrometer",
163+
"y_micrometer",
164+
"z_micrometer",
165+
"x_pixel",
166+
"y_pixel",
167+
"pixel_size_x",
168+
"pixel_size_y",
169+
"bit_depth",
170+
"width",
171+
"height",
172+
"channel_id",
173+
"camera_no",
174+
"file_name",
142175
]
143-
mlf_frame = pd.DataFrame(
144-
columns=mlf_columns, index=range(0, nb_lines)
145-
)
176+
mlf_frame = pd.DataFrame(columns=mlf_columns, index=range(0, nb_lines))
146177

147178
mrf_channel_indices = {
148-
row.channel_id: idx for idx, (_, row) in enumerate(
149-
mrf_frame.iterrows()
150-
)
179+
row.channel_id: idx
180+
for idx, (_, row) in enumerate(mrf_frame.iterrows())
151181
}
152182

153183
error_count = 0
@@ -180,17 +210,15 @@ def blocks(fh, size=65536):
180210

181211
well_row_id = record.get("{%s}Row" % ns["bts"])
182212
well_col_id = record.get("{%s}Column" % ns["bts"])
183-
well_id = chr(64+int(well_row_id)) + str(well_col_id).zfill(2)
213+
well_id = chr(64 + int(well_row_id)) + str(well_col_id).zfill(2)
184214

185-
x_pixel = np.nan
186-
y_pixel = np.nan
187215
bit_depth = np.nan
188216
width = np.nan
189217
height = np.nan
190218
camera_no = np.nan
191219
pixel_size_x = np.nan
192220
pixel_size_y = np.nan
193-
if rec_type == 'IMG':
221+
if rec_type == "IMG":
194222
mrf_idx = mrf_channel_indices[channel_id]
195223
pixel_size_x = mrf_frame.iat[mrf_idx, 1]
196224
pixel_size_y = mrf_frame.iat[mrf_idx, 2]
@@ -204,69 +232,93 @@ def blocks(fh, size=65536):
204232
mlf_frame.iat[idx, 1] = well_id
205233
mlf_frame.iat[idx, 2] = int(well_col_id)
206234
mlf_frame.iat[idx, 3] = int(well_row_id)
207-
mlf_frame.iat[idx, 4] = int(
208-
record.get("{%s}TimePoint" % ns["bts"])
209-
)
210-
mlf_frame.iat[idx, 5] = \
211-
int(record.get("{%s}FieldIndex" % ns["bts"]))
235+
mlf_frame.iat[idx, 4] = int(record.get("{%s}TimePoint" % ns["bts"]))
236+
mlf_frame.iat[idx, 5] = int(record.get("{%s}FieldIndex" % ns["bts"]))
212237
mlf_frame.iat[idx, 6] = int(record.get("{%s}ZIndex" % ns["bts"]))
213-
mlf_frame.iat[idx, 7] = \
214-
int(record.get("{%s}TimelineIndex" % ns["bts"]))
215-
mlf_frame.iat[idx, 8] = \
216-
int(record.get("{%s}ActionIndex" % ns["bts"]))
238+
mlf_frame.iat[idx, 7] = int(
239+
record.get("{%s}TimelineIndex" % ns["bts"])
240+
)
241+
mlf_frame.iat[idx, 8] = int(record.get("{%s}ActionIndex" % ns["bts"]))
217242
mlf_frame.iat[idx, 9] = record.get("{%s}Action" % ns["bts"])
218243
mlf_frame.iat[idx, 10] = x_micrometer
219244
mlf_frame.iat[idx, 11] = y_micrometer
220245
mlf_frame.iat[idx, 12] = z_micrometer
221-
222-
mlf_frame.iat[idx, 13] = pixel_size_x
223-
mlf_frame.iat[idx, 14] = pixel_size_y
224-
225-
mlf_frame.iat[idx, 15] = bit_depth
226-
mlf_frame.iat[idx, 16] = width
227-
mlf_frame.iat[idx, 17] = height
228-
mlf_frame.iat[idx, 18] = channel_id
229-
mlf_frame.iat[idx, 19] = camera_no
230-
mlf_frame.iat[idx, 20] = record.text # file_name
246+
mlf_frame.iat[idx, 13] = width
247+
mlf_frame.iat[idx, 14] = height
248+
mlf_frame.iat[idx, 15] = pixel_size_x
249+
mlf_frame.iat[idx, 16] = pixel_size_y
250+
251+
mlf_frame.iat[idx, 17] = bit_depth
252+
mlf_frame.iat[idx, 18] = width
253+
mlf_frame.iat[idx, 19] = height
254+
mlf_frame.iat[idx, 20] = channel_id
255+
mlf_frame.iat[idx, 21] = camera_no
256+
mlf_frame.iat[idx, 22] = record.text # file_name
231257

232258
mlf_frame = mlf_frame.dropna(thresh=(len(mlf_frame.columns)))
233259
return mlf_frame, error_count
234260

235261

236262
def calculate_steps(site_series: pd.Series):
237-
# site_series is the z_micrometer series for a given site of a given channel
238-
# This function calculates the step size in Z
263+
# site_series is the z_micrometer series for a given site of a given
264+
# channel. This function calculates the step size in Z
239265
steps = site_series.diff()[1:]
240-
assert steps.std().sum() == 0.0, "" \
241-
"When parsing the Yokogawa mlf file, some sites " \
242-
"had varying step size in Z. " \
266+
if not steps.std().sum() == 0.0:
267+
raise Exception(
268+
"When parsing the Yokogawa mlf file, some sites "
269+
"had varying step size in Z. "
243270
"That is not supported for the OME-Zarr parsing"
271+
)
244272
return steps.mean()
245273

246274

247275
def get_z_steps(mlf_frame):
248-
# Process mlf_frame to extract Z information
249-
# run checks on consistencies & return site-based z step dataframe
276+
# Process mlf_frame to extract Z information (pixel size & steps).
277+
# Run checks on consistencies & return site-based z step dataframe
250278
# Group by well, field & channel
251-
grouped_sites_z = mlf_frame.loc[:, [
252-
'well_id',
253-
'field_id',
254-
'action_id',
255-
'channel_id',
256-
'z_micrometer']].groupby(by = [
257-
'well_id',
258-
'field_id',
259-
'action_id',
260-
'channel_id'])
279+
grouped_sites_z = mlf_frame.loc[
280+
:, ["well_id", "field_id", "action_id", "channel_id", "z_micrometer"]
281+
].groupby(by=["well_id", "field_id", "action_id", "channel_id"])
261282
# Group the whole site (combine channels), because Z steps need to be
262283
# consistent between channels for OME-Zarr.
263-
z_data = grouped_sites_z.apply(calculate_steps).groupby(['well_id', 'field_id'])
264-
assert z_data.std().sum().sum() == 0.0, "" \
265-
"When parsing the Yokogawa mlf file, channels had " \
266-
"varying step size in Z. " \
284+
z_data = grouped_sites_z.apply(calculate_steps).groupby(
285+
["well_id", "field_id"]
286+
)
287+
if not z_data.std().sum().sum() == 0.0:
288+
raise Exception(
289+
"When parsing the Yokogawa mlf file, channels had "
290+
"varying step size in Z. "
267291
"That is not supported for the OME-Zarr parsing"
268-
z_frame = z_data.mean()
269-
z_frame.columns = ['pixel_size_z']
292+
)
293+
294+
# Ensure that channels have the same number of z planes and
295+
# reduce it to one value.
296+
# Only check if there is more than one channel available
297+
if any(
298+
grouped_sites_z.count().groupby(["well_id", "field_id"]).count() > 1
299+
):
300+
if any(
301+
grouped_sites_z.count()
302+
.groupby(["well_id", "field_id"])
303+
.std()
304+
.sum()
305+
!= 0
306+
):
307+
raise Exception(
308+
"When parsing the Yokogawa mlf file, channels had "
309+
"varying number of z planes."
310+
"That is not supported for the OME-Zarr parsing"
311+
)
312+
z_steps = (
313+
grouped_sites_z.count()
314+
.groupby(["well_id", "field_id"])
315+
.mean()
316+
.astype(int)
317+
)
318+
319+
# Combine the two dataframes
320+
z_frame = pd.concat([z_data.mean(), z_steps], axis=1)
321+
z_frame.columns = ["pixel_size_z", "z_pixel"]
270322
return z_frame
271323

272324

@@ -275,7 +327,10 @@ def check_grouped_sites_consistency(grouped_sites, per_site_parameters):
275327
# Same for pixel sizes
276328
# Only relevant when a site has multiple entries
277329
if grouped_sites.count().min().min() > 1:
278-
assert grouped_sites.std().sum().sum() == 0.0, "" \
279-
"When parsing the Yokogawa MeasurementData.mlf file, " \
280-
f"some of the parameters {per_site_parameters} varied within " \
330+
if not grouped_sites.std().sum().sum() == 0.0:
331+
raise Exception(
332+
""
333+
"When parsing the Yokogawa MeasurementData.mlf file, "
334+
f"some of the parameters {per_site_parameters} varied within "
281335
"the site. That is not supported for the OME-Zarr parsing"
336+
)

0 commit comments

Comments
 (0)