This repository was archived by the owner on Jun 27, 2023. It is now read-only.

Commit 20d951f

feat: hamt enumlinks custom (#111)
* feat: use custom dag traversal for HAMT link enumeration
* fix comments in completehamt_test.go

Co-authored-by: Lucas Molas <[email protected]>
1 parent 83ad983 commit 20d951f

5 files changed (+236, -54 lines)


go.mod

Lines changed: 6 additions & 0 deletions

```diff
@@ -6,8 +6,13 @@ require (
 	github.com/gopherjs/gopherjs v0.0.0-20190430165422-3e4dfb77656c // indirect
 	github.com/ipfs/go-bitfield v1.0.0
 	github.com/ipfs/go-bitswap v0.1.2 // indirect
+	github.com/ipfs/go-block-format v0.0.2
+	github.com/ipfs/go-blockservice v0.1.0
 	github.com/ipfs/go-cid v0.0.7
+	github.com/ipfs/go-datastore v0.0.5
+	github.com/ipfs/go-ipfs-blockstore v0.0.1
 	github.com/ipfs/go-ipfs-chunker v0.0.1
+	github.com/ipfs/go-ipfs-exchange-offline v0.0.1
 	github.com/ipfs/go-ipfs-files v0.0.3
 	github.com/ipfs/go-ipfs-posinfo v0.0.1
 	github.com/ipfs/go-ipfs-util v0.0.1
@@ -21,6 +26,7 @@ require (
 	github.com/spaolacci/murmur3 v1.1.0
 	github.com/stretchr/testify v1.7.0
 	github.com/warpfork/go-wish v0.0.0-20190328234359-8b3e70f8e830 // indirect
+	golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9
 )
 
 go 1.16
```

go.sum

Lines changed: 1 addition & 0 deletions

```diff
@@ -335,6 +335,7 @@ golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwY
 golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9 h1:SQFwaSi55rU7vdNs9Yr0Z324VNlrF+0wMqRXT4St8ck=
 golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
 golang.org/x/sys v0.0.0-20180909124046-d0be0721c37e/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
```

hamt/hamt.go

Lines changed: 167 additions & 33 deletions

```diff
@@ -24,6 +24,9 @@ import (
 	"context"
 	"fmt"
 	"os"
+	"sync"
+
+	"golang.org/x/sync/errgroup"
 
 	format "github.com/ipfs/go-unixfs"
 	"github.com/ipfs/go-unixfs/internal"
@@ -372,59 +375,190 @@ func (ds *Shard) EnumLinksAsync(ctx context.Context) <-chan format.LinkResult {
 	go func() {
 		defer close(linkResults)
 		defer cancel()
-		getLinks := makeAsyncTrieGetLinks(ds.dserv, linkResults)
-		cset := cid.NewSet()
-		rootNode, err := ds.Node()
-		if err != nil {
-			emitResult(ctx, linkResults, format.LinkResult{Link: nil, Err: err})
-			return
-		}
-		err = dag.Walk(ctx, getLinks, rootNode.Cid(), cset.Visit, dag.Concurrent())
+
+		err := parallelShardWalk(ctx, ds, ds.dserv, func(formattedLink *ipld.Link) error {
+			emitResult(ctx, linkResults, format.LinkResult{Link: formattedLink, Err: nil})
+			return nil
+		})
 		if err != nil {
 			emitResult(ctx, linkResults, format.LinkResult{Link: nil, Err: err})
 		}
 	}()
 	return linkResults
 }
 
-// makeAsyncTrieGetLinks builds a getLinks function that can be used with EnumerateChildrenAsync
-// to iterate a HAMT shard. It takes an IPLD Dag Service to fetch nodes, and a call back that will get called
-// on all links to leaf nodes in a HAMT tree, so they can be collected for an EnumLinks operation
-func makeAsyncTrieGetLinks(dagService ipld.DAGService, linkResults chan<- format.LinkResult) dag.GetLinks {
-
-	return func(ctx context.Context, currentCid cid.Cid) ([]*ipld.Link, error) {
-		node, err := dagService.Get(ctx, currentCid)
-		if err != nil {
-			return nil, err
-		}
-		directoryShard, err := NewHamtFromDag(dagService, node)
-		if err != nil {
-			return nil, err
-		}
+type listCidsAndShards struct {
+	cids   []cid.Cid
+	shards []*Shard
+}
 
-		childShards := make([]*ipld.Link, 0, directoryShard.childer.length())
-		links := directoryShard.childer.links
-		for idx := range directoryShard.childer.children {
-			lnk := links[idx]
-			lnkLinkType, err := directoryShard.childLinkType(lnk)
+func (ds *Shard) walkChildren(processLinkValues func(formattedLink *ipld.Link) error) (*listCidsAndShards, error) {
+	res := &listCidsAndShards{}
 
+	for idx, lnk := range ds.childer.links {
+		if nextShard := ds.childer.children[idx]; nextShard == nil {
+			lnkLinkType, err := ds.childLinkType(lnk)
 			if err != nil {
 				return nil, err
 			}
-			if lnkLinkType == shardLink {
-				childShards = append(childShards, lnk)
-			} else {
-				sv, err := directoryShard.makeShardValue(lnk)
+
+			switch lnkLinkType {
+			case shardValueLink:
+				sv, err := ds.makeShardValue(lnk)
 				if err != nil {
 					return nil, err
 				}
 				formattedLink := sv.val
 				formattedLink.Name = sv.key
-				emitResult(ctx, linkResults, format.LinkResult{Link: formattedLink, Err: nil})
+
+				if err := processLinkValues(formattedLink); err != nil {
+					return nil, err
+				}
+			case shardLink:
+				res.cids = append(res.cids, lnk.Cid)
+			default:
+				return nil, fmt.Errorf("unsupported shard link type")
+			}
+
+		} else {
+			if nextShard.val != nil {
+				formattedLink := &ipld.Link{
+					Name: nextShard.key,
+					Size: nextShard.val.Size,
+					Cid:  nextShard.val.Cid,
+				}
+				if err := processLinkValues(formattedLink); err != nil {
+					return nil, err
+				}
+			} else {
+				res.shards = append(res.shards, nextShard)
+			}
+		}
+	}
+	return res, nil
+}
+
+// parallelShardWalk is quite similar to the DAG walking algorithm from https://github.com/ipfs/go-merkledag/blob/594e515f162e764183243b72c2ba84f743424c8c/merkledag.go#L464
+// However, there are a few notable differences:
+// 1. Some children are actualized Shard structs and some are in the blockstore, this will leverage walking over the in memory Shards as well as the stored blocks
+// 2. Instead of just passing each child into the worker pool by itself we group them so that we can leverage optimizations from GetMany.
+// This optimization also makes the walk a little more biased towards depth (as opposed to BFS) in the earlier part of the DAG.
+// This is particularly helpful for operations like estimating the directory size which should complete quickly when possible.
+// 3. None of the extra options from that package are needed
+func parallelShardWalk(ctx context.Context, root *Shard, dserv ipld.DAGService, processShardValues func(formattedLink *ipld.Link) error) error {
+	const concurrency = 32
+
+	var visitlk sync.Mutex
+	visitSet := cid.NewSet()
+	visit := visitSet.Visit
+
+	// Setup synchronization
+	grp, errGrpCtx := errgroup.WithContext(ctx)
+
+	// Input and output queues for workers.
+	feed := make(chan *listCidsAndShards)
+	out := make(chan *listCidsAndShards)
+	done := make(chan struct{})
+
+	for i := 0; i < concurrency; i++ {
+		grp.Go(func() error {
+			for feedChildren := range feed {
+				for _, nextShard := range feedChildren.shards {
+					nextChildren, err := nextShard.walkChildren(processShardValues)
+					if err != nil {
+						return err
+					}
+
+					select {
+					case out <- nextChildren:
+					case <-errGrpCtx.Done():
+						return nil
+					}
+				}
+
+				var linksToVisit []cid.Cid
+				for _, nextCid := range feedChildren.cids {
+					var shouldVisit bool
+
+					visitlk.Lock()
+					shouldVisit = visit(nextCid)
+					visitlk.Unlock()
+
+					if shouldVisit {
+						linksToVisit = append(linksToVisit, nextCid)
+					}
+				}
+
+				chNodes := dserv.GetMany(errGrpCtx, linksToVisit)
+				for optNode := range chNodes {
+					if optNode.Err != nil {
+						return optNode.Err
+					}
+
+					nextShard, err := NewHamtFromDag(dserv, optNode.Node)
+					if err != nil {
+						return err
+					}
+
+					nextChildren, err := nextShard.walkChildren(processShardValues)
+					if err != nil {
+						return err
+					}
+
+					select {
+					case out <- nextChildren:
+					case <-errGrpCtx.Done():
+						return nil
+					}
+				}
+
+				select {
+				case done <- struct{}{}:
+				case <-errGrpCtx.Done():
+				}
+			}
+			return nil
+		})
+	}
+
+	send := feed
+	var todoQueue []*listCidsAndShards
+	var inProgress int
+
+	next := &listCidsAndShards{
+		shards: []*Shard{root},
+	}
+
+dispatcherLoop:
+	for {
+		select {
+		case send <- next:
+			inProgress++
+			if len(todoQueue) > 0 {
+				next = todoQueue[0]
+				todoQueue = todoQueue[1:]
+			} else {
+				next = nil
+				send = nil
+			}
+		case <-done:
+			inProgress--
+			if inProgress == 0 && next == nil {
+				break dispatcherLoop
+			}
+		case nextNodes := <-out:
+			if next == nil {
+				next = nextNodes
+				send = feed
+			} else {
+				todoQueue = append(todoQueue, nextNodes)
 			}
+		case <-errGrpCtx.Done():
+			break dispatcherLoop
 		}
-		return childShards, nil
 	}
+	close(feed)
+	return grp.Wait()
 }
 
 func emitResult(ctx context.Context, linkResults chan<- format.LinkResult, r format.LinkResult) {
```
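For orientation, nothing changes for callers: `parallelShardWalk` is an internal detail and `EnumLinksAsync` keeps its channel-based interface. Below is a minimal consumption sketch, not part of this commit; the `printEntries` helper and the pre-built `*hamt.Shard` are assumptions for illustration:

```go
package example

import (
	"context"
	"fmt"

	hamt "github.com/ipfs/go-unixfs/hamt"
)

// printEntries drains the channel returned by EnumLinksAsync, which after this
// commit is fed by parallelShardWalk instead of dag.Walk. The shard is assumed
// to have been loaded beforehand, e.g. with hamt.NewHamtFromDag.
func printEntries(ctx context.Context, shard *hamt.Shard) error {
	for res := range shard.EnumLinksAsync(ctx) {
		if res.Err != nil {
			// The walk stops and reports the first error it hits.
			return res.Err
		}
		fmt.Printf("%s -> %s\n", res.Link.Name, res.Link.Cid)
	}
	return nil
}
```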

io/completehamt_test.go

Lines changed: 2 additions & 1 deletion

```diff
@@ -21,7 +21,8 @@ import (
 // * all leaf Shard nodes have the same depth (and have only 'value' links).
 // * all internal Shard nodes point only to other Shards (and hence have zero 'value' links).
 // * the total number of 'value' links (directory entries) is:
-//   io.DefaultShardWidth ^ (treeHeight + 1).
+//   childsPerNode ^ (treeHeight).
+//   treeHeight: The number of layers of non-value HAMT nodes (e.g. height = 1 is a single shard pointing to some values)
 // FIXME: HAMTHashFunction needs to be set to idHash by the caller. We depend on
 // this simplification for the current logic to work. (HAMTHashFunction is a
 // global setting of the package, it is hard-coded in the serialized Shard node
```
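To make the corrected formula concrete: assuming the package's default shard width of 256 (`io.DefaultShardWidth`), a complete HAMT with treeHeight = 1 is a single shard holding 256 value links, and treeHeight = 2 holds 256^2 = 65,536 entries. The helper below merely restates that arithmetic; it is an illustrative sketch, not code from this commit:

```go
package example

// expectedEntries restates the comment's formula: a complete HAMT with the
// given fan-out (childsPerNode) and number of non-value layers (treeHeight)
// holds childsPerNode^treeHeight directory entries.
func expectedEntries(childsPerNode, treeHeight int) int {
	total := 1
	for i := 0; i < treeHeight; i++ {
		total *= childsPerNode
	}
	return total
}

// expectedEntries(256, 1) == 256
// expectedEntries(256, 2) == 65536
```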
