This repository was archived by the owner on Jun 27, 2023. It is now read-only.
feat: hamt enumlinks custom #111
Merged: aschmahmann merged 10 commits into schomatis/directory/unsharding from feat/hamt-enumlinks-custom on Nov 12, 2021
Changes from 5 commits
Commits
- 49314cf feat: use custom dag traversal for HAMT link enumeration (aschmahmann)
- 29ffa00 remove unused code (aschmahmann)
- c930522 cleanup and switch to errgrp (aschmahmann)
- 8051de7 switch to GetMany in EnumLinksAsync (aschmahmann)
- af57e4b switch sharding threshold test to work on the blockstore rather than … (aschmahmann)
- bec9689 gofmt (schomatis)
- d0faeb3 refactor some names and add more comments (aschmahmann)
- a99e187 test: adjust TestHAMTEnumerationWhenComputingSize to allow for optima… (aschmahmann)
- 2927cdc fix TestHAMTEnumerationWhenComputingSize optimal size computation (aschmahmann)
- d9a5431 chore: order deps (aschmahmann)
```diff
@@ -24,6 +24,9 @@ import (
     "context"
     "fmt"
     "os"
+    "sync"
+
+    "golang.org/x/sync/errgroup"
 
     format "github.com/ipfs/go-unixfs"
     "github.com/ipfs/go-unixfs/internal"
```
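The two new imports back the concurrent rewrite further down in this diff. For readers who haven't used errgroup before, the general worker-pool shape being relied on is roughly the following; this is a generic, self-contained sketch for illustration, not code from this PR:

```go
package main

import (
	"context"
	"fmt"

	"golang.org/x/sync/errgroup"
)

func main() {
	// A group whose context is cancelled as soon as any goroutine in it
	// returns a non-nil error.
	grp, ctx := errgroup.WithContext(context.Background())
	feed := make(chan int)

	// Producer: feeds work items, bailing out early if the group fails.
	grp.Go(func() error {
		defer close(feed)
		for i := 0; i < 10; i++ {
			select {
			case feed <- i:
			case <-ctx.Done():
				return ctx.Err()
			}
		}
		return nil
	})

	// Workers: returning a non-nil error cancels ctx for everyone else.
	for w := 0; w < 4; w++ {
		grp.Go(func() error {
			for job := range feed {
				fmt.Println("processed", job)
			}
			return nil
		})
	}

	// Wait returns the first non-nil error once every goroutine has exited.
	if err := grp.Wait(); err != nil {
		fmt.Println("walk failed:", err)
	}
}
```

The sync import is used below only for a Mutex guarding the CID visit set.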
```diff
@@ -372,59 +375,183 @@ func (ds *Shard) EnumLinksAsync(ctx context.Context) <-chan format.LinkResult {
     go func() {
         defer close(linkResults)
         defer cancel()
-        getLinks := makeAsyncTrieGetLinks(ds.dserv, linkResults)
-        cset := cid.NewSet()
-        rootNode, err := ds.Node()
-        if err != nil {
-            emitResult(ctx, linkResults, format.LinkResult{Link: nil, Err: err})
-            return
-        }
-        err = dag.Walk(ctx, getLinks, rootNode.Cid(), cset.Visit, dag.Concurrent())
+
+        err := parallelWalkDepth(ctx, ds, ds.dserv, func(formattedLink *ipld.Link) error {
+            emitResult(ctx, linkResults, format.LinkResult{Link: formattedLink, Err: nil})
+            return nil
+        })
         if err != nil {
             emitResult(ctx, linkResults, format.LinkResult{Link: nil, Err: err})
         }
     }()
     return linkResults
 }
```
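The externally visible behaviour of EnumLinksAsync stays the same: callers still read format.LinkResult values from a channel that is closed when the walk finishes, with a failure delivered as a final result carrying Err. A hypothetical caller-side helper (collectLinks is an invented name, not part of this PR) might look like:

```go
// collectLinks drains EnumLinksAsync into a slice. Illustrative sketch only;
// assumes it lives in the hamt package alongside Shard, with ctx a live
// context and ds an already-loaded shard (e.g. via NewHamtFromDag).
func collectLinks(ctx context.Context, ds *Shard) ([]*ipld.Link, error) {
	var out []*ipld.Link
	for res := range ds.EnumLinksAsync(ctx) {
		if res.Err != nil {
			return nil, res.Err
		}
		out = append(out, res.Link)
	}
	return out, nil
}
```

The diff continues with the removal of makeAsyncTrieGetLinks and its replacement by a listCidShardUnion type and a (*Shard).walkLinks method: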
```diff
-// makeAsyncTrieGetLinks builds a getLinks function that can be used with EnumerateChildrenAsync
-// to iterate a HAMT shard. It takes an IPLD Dag Service to fetch nodes, and a call back that will get called
-// on all links to leaf nodes in a HAMT tree, so they can be collected for an EnumLinks operation
-func makeAsyncTrieGetLinks(dagService ipld.DAGService, linkResults chan<- format.LinkResult) dag.GetLinks {
-    return func(ctx context.Context, currentCid cid.Cid) ([]*ipld.Link, error) {
-        node, err := dagService.Get(ctx, currentCid)
-        if err != nil {
-            return nil, err
-        }
-        directoryShard, err := NewHamtFromDag(dagService, node)
-        if err != nil {
-            return nil, err
-        }
-
-        childShards := make([]*ipld.Link, 0, directoryShard.childer.length())
-        links := directoryShard.childer.links
-        for idx := range directoryShard.childer.children {
-            lnk := links[idx]
-            lnkLinkType, err := directoryShard.childLinkType(lnk)
-            if err != nil {
-                return nil, err
-            }
-            if lnkLinkType == shardLink {
-                childShards = append(childShards, lnk)
-            } else {
-                sv, err := directoryShard.makeShardValue(lnk)
-                if err != nil {
-                    return nil, err
-                }
-                formattedLink := sv.val
-                formattedLink.Name = sv.key
-                emitResult(ctx, linkResults, format.LinkResult{Link: formattedLink, Err: nil})
-            }
-        }
-        return childShards, nil
-    }
-}
+type listCidShardUnion struct {
+    links  []cid.Cid
+    shards []*Shard
+}
+
+func (ds *Shard) walkLinks(processLinkValues func(formattedLink *ipld.Link) error) (*listCidShardUnion, error) {
+    res := &listCidShardUnion{}
+
+    for idx, lnk := range ds.childer.links {
+        if nextShard := ds.childer.children[idx]; nextShard == nil {
+            lnkLinkType, err := ds.childLinkType(lnk)
+            if err != nil {
+                return nil, err
+            }
+
+            switch lnkLinkType {
+            case shardValueLink:
+                sv, err := ds.makeShardValue(lnk)
+                if err != nil {
+                    return nil, err
+                }
+                formattedLink := sv.val
+                formattedLink.Name = sv.key
+
+                if err := processLinkValues(formattedLink); err != nil {
+                    return nil, err
+                }
+            case shardLink:
+                res.links = append(res.links, lnk.Cid)
+            default:
+                return nil, fmt.Errorf("unsupported shard link type")
+            }
+        } else {
+            if nextShard.val != nil {
+                formattedLink := &ipld.Link{
+                    Name: nextShard.key,
+                    Size: nextShard.val.Size,
+                    Cid:  nextShard.val.Cid,
+                }
+                if err := processLinkValues(formattedLink); err != nil {
+                    return nil, err
+                }
+            } else {
+                res.shards = append(res.shards, nextShard)
+            }
+        }
+    }
+    return res, nil
+}
```
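For intuition about how walkLinks is meant to be driven, a sequential equivalent of the traversal could look like the sketch below; sequentialWalk is an invented name for illustration, and the PR's actual driver is the concurrent parallelWalkDepth shown next:

```go
// sequentialWalk is a hypothetical single-threaded driver over walkLinks,
// shown only to illustrate the traversal shape; the PR uses the concurrent
// parallelWalkDepth instead.
func sequentialWalk(ctx context.Context, root *Shard, dserv ipld.DAGService, process func(*ipld.Link) error) error {
	todo := []*listCidShardUnion{{shards: []*Shard{root}}}
	seen := cid.NewSet()
	for len(todo) > 0 {
		batch := todo[0]
		todo = todo[1:]
		// Children already loaded in memory: walk them directly.
		for _, sh := range batch.shards {
			next, err := sh.walkLinks(process)
			if err != nil {
				return err
			}
			todo = append(todo, next)
		}
		// Children known only by CID: fetch, rebuild the shard, then walk it.
		for _, c := range batch.links {
			if !seen.Visit(c) {
				continue // already traversed
			}
			nd, err := dserv.Get(ctx, c)
			if err != nil {
				return err
			}
			sh, err := NewHamtFromDag(dserv, nd)
			if err != nil {
				return err
			}
			next, err := sh.walkLinks(process)
			if err != nil {
				return err
			}
			todo = append(todo, next)
		}
	}
	return nil
}
```

The hunk continues with the concurrent driver itself: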
```diff
+
+func parallelWalkDepth(ctx context.Context, root *Shard, dserv ipld.DAGService, processShardValues func(formattedLink *ipld.Link) error) error {
+    const concurrency = 32
+
+    var visitlk sync.Mutex
+    visitSet := cid.NewSet()
+    visit := visitSet.Visit
+
+    // Setup synchronization
+    grp, errGrpCtx := errgroup.WithContext(ctx)
+
+    // Input and output queues for workers.
+    feed := make(chan *listCidShardUnion)
+    out := make(chan *listCidShardUnion)
+    done := make(chan struct{})
+
+    for i := 0; i < concurrency; i++ {
+        grp.Go(func() error {
+            for shardOrCID := range feed {
+                for _, nextShard := range shardOrCID.shards {
+                    nextLinks, err := nextShard.walkLinks(processShardValues)
+                    if err != nil {
+                        return err
+                    }
+
+                    select {
+                    case out <- nextLinks:
+                    case <-errGrpCtx.Done():
+                        return nil
+                    }
+                }
+
+                var linksToVisit []cid.Cid
+                for _, nextLink := range shardOrCID.links {
+                    var shouldVisit bool
+
+                    visitlk.Lock()
+                    shouldVisit = visit(nextLink)
+                    visitlk.Unlock()
+
+                    if shouldVisit {
+                        linksToVisit = append(linksToVisit, nextLink)
+                    }
+                }
+
+                chNodes := dserv.GetMany(errGrpCtx, linksToVisit)
+                for optNode := range chNodes {
+                    if optNode.Err != nil {
+                        return optNode.Err
+                    }
+
+                    nextShard, err := NewHamtFromDag(dserv, optNode.Node)
+                    if err != nil {
+                        return err
+                    }
+
+                    nextLinks, err := nextShard.walkLinks(processShardValues)
+                    if err != nil {
+                        return err
+                    }
+
+                    select {
+                    case out <- nextLinks:
+                    case <-errGrpCtx.Done():
+                        return nil
+                    }
+                }
+
+                select {
+                case done <- struct{}{}:
+                case <-errGrpCtx.Done():
+                }
+            }
+            return nil
+        })
+    }
+
+    send := feed
+    var todoQueue []*listCidShardUnion
+    var inProgress int
+
+    next := &listCidShardUnion{
+        shards: []*Shard{root},
+    }
+
+dispatcherLoop:
+    for {
+        select {
+        case send <- next:
+            inProgress++
+            if len(todoQueue) > 0 {
+                next = todoQueue[0]
+                todoQueue = todoQueue[1:]
+            } else {
+                next = nil
+                send = nil
+            }
+        case <-done:
+            inProgress--
+            if inProgress == 0 && next == nil {
+                break dispatcherLoop
+            }
+        case nextNodes := <-out:
+            if next == nil {
+                next = nextNodes
+                send = feed
+            } else {
+                todoQueue = append(todoQueue, nextNodes)
+            }
+        case <-errGrpCtx.Done():
+            break dispatcherLoop
+        }
+    }
+    close(feed)
+    return grp.Wait()
+}
+
 func emitResult(ctx context.Context, linkResults chan<- format.LinkResult, r format.LinkResult) {
```

Inline review comments on this hunk:

- On the visit-set deduplication in the worker loop (visitlk / shouldVisit): "nit: I think we could drop this optimization (to simplify the code) as I wouldn't expect to have repeated internal (non-value) shard nodes."
- On the call to nextShard.walkLinks(processShardValues): "nit: The general nomenclature around …"
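The hunk ends at the unchanged emitResult helper. Presumably it is the usual context-aware send, along the lines of the sketch below (not necessarily the exact body in this file):

```go
// Sketch of a context-aware send helper like emitResult: either deliver the
// result or give up once the caller's context is cancelled, so the walking
// goroutine never blocks on a consumer that has gone away.
func emitResult(ctx context.Context, linkResults chan<- format.LinkResult, r format.LinkResult) {
	select {
	case linkResults <- r:
	case <-ctx.Done():
	}
}
```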
Review comment:

nit: The walk algorithm here is more expansive than the original and its differences should be documented (as this is clearly a copy of the other, and anyone reading this code will be thinking of the original when trying to reason through it). (Not referring to the GetMany optimization, which is valid in itself and could even be incorporated into the Shard logic.)

In the original we process one parent node at a time (represented by its CID), extract its children, filter which should be emitted as output (value links/shards), and push the rest to the queue/feed one at a time to be processed independently in the next iteration, each as a new parent node. Here we send (after filtering) all the children together in bulk (the lists in listCidShardUnion) and then extract all of their children together in turn. (It might be a valid optimization and this comment is not against it, just advocating for more documentation around it.) I'm not sure if this affects the traversal behavior expected by TestHAMTEnumerationWhenComputingSize; I don't think so, but I need more time to think about it.
(edit: it was affecting tests, see comment below)
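To make the difference described in the comment above concrete, here is a toy, self-contained comparison of the two dispatch shapes; node, walkPerNode and walkBatched are invented stand-ins for illustration, not types or functions from go-unixfs:

```go
package main

import "fmt"

// node is a toy stand-in for a HAMT shard: some leaf values plus sub-shards.
type node struct {
	values   []string
	children []*node
}

// walkPerNode mirrors the original dag.Walk-based traversal: every child
// shard becomes its own queue entry and is expanded independently.
func walkPerNode(root *node, emit func(string)) {
	queue := []*node{root}
	for len(queue) > 0 {
		n := queue[0]
		queue = queue[1:]
		for _, v := range n.values {
			emit(v)
		}
		queue = append(queue, n.children...)
	}
}

// walkBatched mirrors the shape of parallelWalkDepth in this PR: all children
// of a node travel together as one work item (like listCidShardUnion), and
// the whole batch is expanded in one step.
func walkBatched(root *node, emit func(string)) {
	queue := [][]*node{{root}}
	for len(queue) > 0 {
		batch := queue[0]
		queue = queue[1:]
		var next []*node
		for _, n := range batch {
			for _, v := range n.values {
				emit(v)
			}
			next = append(next, n.children...)
		}
		if len(next) > 0 {
			queue = append(queue, next)
		}
	}
}

func main() {
	tree := &node{
		values: []string{"a"},
		children: []*node{
			{values: []string{"b"}},
			{values: []string{"c"}, children: []*node{{values: []string{"d"}}}},
		},
	}
	walkPerNode(tree, func(v string) { fmt.Println("per-node:", v) })
	walkBatched(tree, func(v string) { fmt.Println("batched:", v) })
}
```

Both variants emit every value; the difference is purely in how work items are grouped, which is what the comment asks to have documented.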