-
Notifications
You must be signed in to change notification settings - Fork 254
/
Copy pathWhitespaceCoverExtractor.cs
386 lines (334 loc) · 16.7 KB
/
WhitespaceCoverExtractor.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
namespace UglyToad.PdfPig.DocumentLayoutAnalysis
{
using Content;
using Core;
using Geometry;
using System;
using System.Collections.Generic;
using System.Linq;
/// <summary>
/// A top-down algorithm that finds a cover of the background whitespace of a document in terms of maximal empty rectangles.
/// <para>See Section 3.2 of 'High precision text extraction from PDF documents' by Øyvind Raddum Berg and Section 2 of 'Two geometric algorithms for layout analysis' by Thomas M. Breuel.</para>
/// </summary>
public static class WhitespaceCoverExtractor
{
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="images">The images in the page.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images = null, int maxRectangleCount = 40, int maxBoundQueueSize = 0)
{
return GetWhitespaces(words,
images,
words.SelectMany(w => w.Letters).Select(x => x.GlyphRectangle.Width).Mode() * 1.25,
words.SelectMany(w => w.Letters).Select(x => x.GlyphRectangle.Height).Mode() * 1.25,
maxRectangleCount: maxRectangleCount,
maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// </summary>
/// <param name="words">The words in the page.</param>
/// <param name="images">The images in the page.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<Word> words, IEnumerable<IPdfImage> images,
double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
var bboxes = words.Where(w => w.BoundingBox.Width > 0 && w.BoundingBox.Height > 0)
.Select(o => o.BoundingBox).ToList();
if (images?.Any() == true)
{
bboxes.AddRange(images.Where(w => w.Bounds.Width > 0 && w.Bounds.Height > 0).Select(o => o.Bounds));
}
return GetWhitespaces(bboxes,
minWidth: minWidth,
minHeight: minHeight,
maxRectangleCount: maxRectangleCount,
whitespaceFuzziness: whitespaceFuzziness,
maxBoundQueueSize: maxBoundQueueSize);
}
/// <summary>
/// Gets the cover of the background whitespace of a page in terms of maximal empty rectangles.
/// </summary>
/// <param name="boundingboxes">The list of obstacles' bounding boxes in the page.</param>
/// <param name="minWidth">Lower bounds for the width of rectangles.</param>
/// <param name="minHeight">Lower bounds for the height of rectangles.</param>
/// <param name="maxRectangleCount">The maximum number of rectangles to find.</param>
/// <param name="whitespaceFuzziness">Constant value to allow candidate whitespace rectangle to overlap the
/// surrounding obstacles by some percent. Default value is 15%.</param>
/// <param name="maxBoundQueueSize">The maximum size of the queue used in the algorithm.</param>
/// <returns>The identified whitespace rectangles.</returns>
public static IReadOnlyList<PdfRectangle> GetWhitespaces(IEnumerable<PdfRectangle> boundingboxes,
double minWidth, double minHeight, int maxRectangleCount = 40, double whitespaceFuzziness = 0.15, int maxBoundQueueSize = 0)
{
if (!boundingboxes.Any())
{
return Array.Empty<PdfRectangle>();
}
var obstacles = new HashSet<PdfRectangle>(boundingboxes);
var pageBound = GetBound(obstacles);
return GetMaximalRectangles(pageBound,
obstacles,
minWidth: minWidth,
minHeight: minHeight,
maxRectangleCount: maxRectangleCount,
whitespaceFuzziness: whitespaceFuzziness,
maxBoundQueueSize: maxBoundQueueSize);
}
private static IReadOnlyList<PdfRectangle> GetMaximalRectangles(PdfRectangle bound,
HashSet<PdfRectangle> obstacles, double minWidth, double minHeight, int maxRectangleCount,
double whitespaceFuzziness, int maxBoundQueueSize)
{
var queueEntries = new QueueEntries(maxBoundQueueSize);
queueEntries.Enqueue(new QueueEntry(bound, obstacles, whitespaceFuzziness));
var selected = new HashSet<PdfRectangle>();
var holdList = new HashSet<QueueEntry>();
while (queueEntries.Any())
{
var current = queueEntries.Dequeue();
if (current.IsEmptyEnough(obstacles))
{
if (selected.Any(c => Inside(c, current.Bound)))
{
continue;
}
// A check was added which impeded the algorithm from accepting
// rectangles which were not adjacent to an already accepted
// rectangle, or to the border of the page.
if (!IsAdjacentToPageBounds(bound, current.Bound) && // NOT in contact to border page AND
!selected.Any(q => IsAdjacentTo(q, current.Bound))) // NOT in contact to any already accepted rectangle
{
// In order to maintain the correctness of the algorithm,
// rejected rectangles are put in a hold list.
holdList.Add(current);
continue;
}
selected.Add(current.Bound);
if (selected.Count >= maxRectangleCount)
{
return selected.ToList();
}
obstacles.Add(current.Bound);
// Each time a new rectangle is identified and accepted, this hold list
// will be added back to the queue in case any of them will have become valid.
foreach (var hold in holdList)
{
queueEntries.Enqueue(hold);
}
// After a maximal rectangle has been found, it is added back to the list
// of obstacles. Whenever a QueueEntry is dequeued, its list of obstacles
// can be recomputed to include newly identified whitespace rectangles.
foreach (var overlapping in queueEntries)
{
if (OverlapsHard(current.Bound, overlapping.Bound))
{
overlapping.AddWhitespace(current.Bound);
}
}
continue;
}
var pivot = current.GetPivot();
var b = current.Bound;
var subRectangles = new List<PdfRectangle>();
var rRight = new PdfRectangle(pivot.Right, b.Bottom, b.Right, b.Top);
if (b.Right > pivot.Right && rRight.Height > minHeight && rRight.Width > minWidth)
{
queueEntries.Enqueue(new QueueEntry(rRight,
new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rRight, o))),
whitespaceFuzziness));
}
var rLeft = new PdfRectangle(b.Left, b.Bottom, pivot.Left, b.Top);
if (b.Left < pivot.Left && rLeft.Height > minHeight && rLeft.Width > minWidth)
{
queueEntries.Enqueue(new QueueEntry(rLeft,
new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rLeft, o))),
whitespaceFuzziness));
}
var rAbove = new PdfRectangle(b.Left, b.Bottom, b.Right, pivot.Bottom);
if (b.Bottom < pivot.Bottom && rAbove.Height > minHeight && rAbove.Width > minWidth)
{
queueEntries.Enqueue(new QueueEntry(rAbove,
new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rAbove, o))),
whitespaceFuzziness));
}
var rBelow = new PdfRectangle(b.Left, pivot.Top, b.Right, b.Top);
if (b.Top > pivot.Top && rBelow.Height > minHeight && rBelow.Width > minWidth)
{
queueEntries.Enqueue(new QueueEntry(rBelow,
new HashSet<PdfRectangle>(current.Obstacles.Where(o => OverlapsHard(rBelow, o))),
whitespaceFuzziness));
}
}
return selected.ToList();
}
private static bool IsAdjacentTo(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
if (rectangle1.Left > rectangle2.Right ||
rectangle2.Left > rectangle1.Right ||
rectangle1.Top < rectangle2.Bottom ||
rectangle2.Top < rectangle1.Bottom)
{
return false;
}
return rectangle1.Left == rectangle2.Right ||
rectangle1.Right == rectangle2.Left ||
rectangle1.Bottom == rectangle2.Top ||
rectangle1.Top == rectangle2.Bottom;
}
private static bool IsAdjacentToPageBounds(PdfRectangle pageBound, PdfRectangle rectangle)
{
return rectangle.Bottom == pageBound.Bottom ||
rectangle.Top == pageBound.Top ||
rectangle.Left == pageBound.Left ||
rectangle.Right == pageBound.Right;
}
private static bool OverlapsHard(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
return rectangle1.Left < rectangle2.Right &&
rectangle2.Left < rectangle1.Right &&
rectangle1.Top > rectangle2.Bottom &&
rectangle2.Top > rectangle1.Bottom;
}
private static bool Inside(PdfRectangle rectangle1, PdfRectangle rectangle2)
{
return rectangle2.Right <= rectangle1.Right && rectangle2.Left >= rectangle1.Left &&
rectangle2.Top <= rectangle1.Top && rectangle2.Bottom >= rectangle1.Bottom;
}
private static PdfRectangle GetBound(IEnumerable<PdfRectangle> obstacles)
{
return new PdfRectangle(
obstacles.Min(b => b.Left),
obstacles.Min(b => b.Bottom),
obstacles.Max(b => b.Right),
obstacles.Max(b => b.Top));
}
#region Sorted Queue
private class QueueEntries : SortedSet<QueueEntry>
{
private readonly int bound;
public QueueEntries(int maximumBound)
{
bound = maximumBound;
}
public QueueEntry Dequeue()
{
var current = Max;
Remove(current);
return current;
}
public void Enqueue(QueueEntry queueEntry)
{
if (bound > 0 && Count > bound)
{
Remove(Min);
}
Add(queueEntry);
}
}
private class QueueEntry : IComparable<QueueEntry>
{
private readonly double quality;
private readonly double whitespaceFuzziness;
public PdfRectangle Bound { get; }
public HashSet<PdfRectangle> Obstacles { get; }
public QueueEntry(PdfRectangle bound, HashSet<PdfRectangle> obstacles, double whitespaceFuzziness)
{
Bound = bound;
quality = ScoringFunction(Bound);
Obstacles = obstacles;
this.whitespaceFuzziness = whitespaceFuzziness;
}
public PdfRectangle GetPivot()
{
int indexMiddle = Distances.FindIndexNearest(Bound.Centroid,
Obstacles.Select(o => o.Centroid).ToList(),
p => p, p => p, Distances.Euclidean, out double d);
return indexMiddle == -1 ? Obstacles.First() : Obstacles.ElementAt(indexMiddle);
}
public bool IsEmptyEnough()
{
return Obstacles.Count == 0;
}
public bool IsEmptyEnough(IEnumerable<PdfRectangle> pageObstacles)
{
if (IsEmptyEnough())
{
return true;
}
double sum = 0;
foreach (var obstacle in pageObstacles)
{
var intersect = Bound.Intersect(obstacle);
if (!intersect.HasValue)
{
return false;
}
double minimumArea = MinimumOverlappingArea(obstacle, Bound, whitespaceFuzziness);
if (intersect.Value.Area > minimumArea)
{
return false;
}
sum += intersect.Value.Area;
}
return sum < Bound.Area * whitespaceFuzziness;
}
public override string ToString()
{
return "Q=" + quality.ToString("#0.0") + ", O=" + Obstacles.Count + ", " + Bound.ToString();
}
public void AddWhitespace(PdfRectangle rectangle)
{
Obstacles.Add(rectangle);
}
public int CompareTo(QueueEntry entry)
{
return quality.CompareTo(entry.quality);
}
public override bool Equals(object obj)
{
if (obj is QueueEntry entry)
{
return Bound.Left == entry.Bound.Left &&
Bound.Right == entry.Bound.Right &&
Bound.Top == entry.Bound.Top &&
Bound.Bottom == entry.Bound.Bottom &&
Obstacles == entry.Obstacles;
}
return false;
}
public override int GetHashCode()
{
return (Bound.Left, Bound.Right,
Bound.Top, Bound.Bottom,
Obstacles).GetHashCode();
}
private static double MinimumOverlappingArea(PdfRectangle r1, PdfRectangle r2, double whitespaceFuzziness)
{
return Math.Min(r1.Area, r2.Area) * whitespaceFuzziness;
}
/// <summary>
/// The scoring function Q(r) which is subsequently used to sort a priority queue.
/// </summary>
/// <param name="rectangle"></param>
private static double ScoringFunction(PdfRectangle rectangle)
{
// As can be seen, tall rectangles are preferred. The trick while choosing this Q(r) was
// to keep that preference while still allowing wide rectangles to be chosen. After having
// experimented with quite a few variations, this simple function was considered a good
// solution.
return rectangle.Area * (rectangle.Height / 4.0);
}
}
#endregion
}
}