{-# LANGUAGE NamedFieldPuns #-}

-- | Parallel versions of 'filter' and 'simpleFilter'
module Text.Fuzzy.Parallel
( filter,
  simpleFilter,
  -- reexports
  Fuzzy(..),
  match
) where

import Control.Monad.ST (runST)
import Control.Parallel.Strategies (Eval, Strategy, evalTraversable,
                                    parTraversable, rseq, using)
import Data.Function (on)
import Data.Monoid.Textual (TextualMonoid)
import Data.Ord (Down (Down))
import Data.Vector (Vector, (!))
import qualified Data.Vector as V
-- need to use a stable sort
import Data.Bifunctor (second)
import qualified Data.Vector.Algorithms.Tim as VA
import Prelude hiding (filter)
import Text.Fuzzy (Fuzzy (..), match)

-- | The function to filter a list of values by fuzzy search on the text
-- extracted from them, in parallel.
--
-- >>> length $ filter 1000 "ML" (concat $ replicate 10000 [("Standard ML", 1990),("OCaml",1996),("Scala",2003)]) "<" ">" fst False
-- 20000
filter :: (TextualMonoid s)
       => Int         -- ^ Chunk size. 1000 works well.
       -> s           -- ^ Pattern.
       -> [t]         -- ^ The list of values containing the text to search in.
       -> s           -- ^ The text to add before each match.
       -> s           -- ^ The text to add after each match.
       -> (t -> s)    -- ^ The function to extract the text from the container.
       -> Bool        -- ^ Case sensitivity.
       -> [Fuzzy t s] -- ^ The list of results, sorted, highest score first.
filter chunkSize pattern ts pre post extract caseSen = runST $ do
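  -- Fuzzy-match every candidate and force the scores in parallel, chunk by
  -- chunk, keeping only the successful matches.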
  let v = V.mapMaybe id
            (V.map (\t -> match pattern t pre post extract caseSen) (V.fromList ts)
             `using`
             parVectorChunk chunkSize (evalTraversable forceScore))
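  -- Sort in place by descending score. unsafeThaw/unsafeFreeze avoid copying,
  -- which is safe here because 'v' is not used again, and Tim sort is stable,
  -- so results with equal scores keep their original relative order.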
  v' <- V.unsafeThaw v
  VA.sortBy (compare `on` (Down . score)) v'
  v'' <- V.unsafeFreeze v'
  return $ V.toList v''

-- | Return all elements of the list that have a fuzzy
-- match against the pattern. Runs with default settings where
-- nothing is added around the matches and the search is case insensitive.
--
-- >>> simpleFilter 1000 "vm" ["vim", "emacs", "virtual machine"]
-- ["vim","virtual machine"]
{-# INLINABLE simpleFilter #-}
simpleFilter :: (TextualMonoid s)
             => Int -- ^ Chunk size. 1000 works well.
             -> s   -- ^ Pattern to look for.
             -> [s] -- ^ List of texts to check.
             -> [s] -- ^ The ones that match.
simpleFilter chunk pattern xs =
  map original $ filter chunk pattern xs mempty mempty id False

--------------------------------------------------------------------------------

-- | Evaluation that forces the 'score' field
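-- Forcing the score inside the strategy means the expensive matching work is
-- carried out inside the parallel sparks, rather than being deferred until the
-- scores are demanded by the sequential sort in 'filter'.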
forceScore :: TextualMonoid s => Fuzzy t s -> Eval (Fuzzy t s)
forceScore it@Fuzzy{score} = do
  score' <- rseq score
  return it{score = score'}

--------------------------------------------------------------------------------

-- | Divide a vector into chunks and apply the strategy to each chunk in parallel.
parVectorChunk :: Int -> Strategy a -> Vector a -> Eval (Vector a)
parVectorChunk chunkSize st v =
  V.concat <$> parTraversable (evalTraversable st) (chunkVector chunkSize v)
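
-- The strategy only controls evaluation: the resulting vector is equal to the
-- input, since concatenating the chunks restores it. For illustration:
--
-- >>> V.fromList [1..10] `using` parVectorChunk 3 rseq
-- [1,2,3,4,5,6,7,8,9,10]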

-- >>> chunkVector 3 (V.fromList [0..10])
-- >>> chunkVector 3 (V.fromList [0..11])
-- >>> chunkVector 3 (V.fromList [0..12])
-- [[0,1,2],[3,4,5],[6,7,8],[9,10]]
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11]]
-- [[0,1,2],[3,4,5],[6,7,8],[9,10,11],[12]]
chunkVector :: Int -> Vector a -> [Vector a]
chunkVector chunkSize v =
  [ V.fromListN (hi - lo + 1) [v ! j | j <- [lo .. hi]]
  | (lo, hi) <- chunkIndices chunkSize (0, V.length v)]
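
-- Note: @V.concat (chunkVector n v) == v@ for any chunk size @n >= 1@, since
-- the chunks cover the index range of @v@ in order and without overlap.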

-- >>> chunkIndices 3 (0,9)
-- >>> chunkIndices 3 (0,10)
-- >>> chunkIndices 3 (0,11)
-- [(0,2),(3,5),(6,8)]
-- [(0,2),(3,5),(6,8),(9,9)]
-- [(0,2),(3,5),(6,8),(9,10)]
chunkIndices :: Int -> (Int,Int) -> [(Int,Int)]
chunkIndices chunkSize (from,to) =
  map (second pred) $
  pairwise $
  [from, from+chunkSize .. to-1] ++ [to]

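-- Overlapping consecutive pairs of a list, used above to turn the chunk
-- boundaries into (start, end) ranges, e.g.:
--
-- >>> pairwise [0,3,6,9]
-- [(0,3),(3,6),(6,9)]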
pairwise :: [a] -> [(a,a)]
pairwise [] = []
pairwise [_] = []
pairwise (x:y:xs) = (x,y) : pairwise (y:xs)