Skip to content

Return multiple errors from Writer.WriteMessages and MessageTooLargeError handling improvements #401

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
62bb464
Export WriterError struct type and associated fields.
Evanjt1 Oct 17, 2019
046b6b6
Relocate WriterError so it is grouped with other exported types in wr…
Evanjt1 Oct 17, 2019
46fffc3
Fix bug in writer.write where messages were being sent to the incorre…
Evanjt1 Oct 23, 2019
adc009d
Add id field to writerMessage struct type.
Evanjt1 Oct 23, 2019
7aa5503
Set id when creating writerMessages struct in writer.WriteMessages.
Evanjt1 Oct 24, 2019
426ab01
Add writerResponse struct type to wrap write errors with message id.
Evanjt1 Oct 24, 2019
d6aaf60
Return writerResponse in fakeWriter.messages() in writer_test.
Evanjt1 Oct 28, 2019
0891d29
Add test for kafka 2.2.1 compatibility to circle ci config.
Evanjt1 Nov 6, 2019
0bd6dc0
Remove unused WriterError.Cause method
Evanjt1 Feb 5, 2020
4963dac
Add WriterErrors type to wrap a slice of WriterError
Evanjt1 Feb 5, 2020
67b360e
Change err type in writerResponse to *WriterError
Evanjt1 Feb 11, 2020
d5ef30b
Return multiple errors from Writer.WriteMessages and try entire batch…
Evanjt1 Feb 11, 2020
427f1ed
Rename receiver in WriterErrors.Error method
Evanjt1 Feb 11, 2020
4a268bb
Add writerTestCase type which provides common methods for Writer tests
Evanjt1 Feb 11, 2020
407ea25
Update tests for Writer type
Evanjt1 Feb 11, 2020
9e8274a
Fix race condition by making testRetryWriter type thread safe
Evanjt1 Feb 11, 2020
0876ebf
Add Unwrap method to WriterError type.
Evanjt1 Feb 24, 2020
187a291
Merge remote-tracking branch 'upstream/master' into writer-return-errors
Evanjt1 Feb 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,37 @@ jobs:
- run: go get -v -t . ./gzip ./lz4 ./sasl ./snappy
- run: go test -v -race -cover -timeout 150s $(go list ./... | grep -v examples)

# kafka-221 runs the Go test suite against a Kafka 2.2.1 broker
# (wurstmeister/kafka:2.12-2.2.1) alongside a ZooKeeper container.
# The broker is configured with a PLAINTEXT listener on port 9092 and a
# SASL_PLAINTEXT listener on port 9093; CUSTOM_INIT_SCRIPT writes the JAAS
# configuration file and adds SCRAM credentials for the "adminscram" user.
kafka-221:
working_directory: /go/src/github.com/segmentio/kafka-go
environment:
KAFKA_VERSION: "2.2.1"
docker:
- image: circleci/golang
- image: wurstmeister/zookeeper
ports: ['2181:2181']
- image: wurstmeister/kafka:2.12-2.2.1
ports: ['9092:9092','9093:9093']
environment:
KAFKA_BROKER_ID: '1'
KAFKA_CREATE_TOPICS: 'test-writer-0:3:1,test-writer-1:3:1'
KAFKA_DELETE_TOPIC_ENABLE: 'true'
KAFKA_ADVERTISED_HOST_NAME: 'localhost'
KAFKA_ADVERTISED_PORT: '9092'
KAFKA_ZOOKEEPER_CONNECT: 'localhost:2181'
KAFKA_AUTO_CREATE_TOPICS_ENABLE: 'true'
KAFKA_LISTENERS: 'PLAINTEXT://:9092,SASL_PLAINTEXT://:9093'
KAFKA_ADVERTISED_LISTENERS: 'PLAINTEXT://localhost:9092,SASL_PLAINTEXT://localhost:9093'
KAFKA_SASL_ENABLED_MECHANISMS: SCRAM-SHA-256,SCRAM-SHA-512,PLAIN
KAFKA_OPTS: "-Djava.security.auth.login.config=/opt/kafka/config/kafka_server_jaas.conf"
CUSTOM_INIT_SCRIPT: |-
echo -e 'KafkaServer {\norg.apache.kafka.common.security.scram.ScramLoginModule required\n username="adminscram"\n password="admin-secret";\n org.apache.kafka.common.security.plain.PlainLoginModule required\n username="adminplain"\n password="admin-secret"\n user_adminplain="admin-secret";\n };' > /opt/kafka/config/kafka_server_jaas.conf;
/opt/kafka/bin/kafka-configs.sh --zookeeper localhost:2181 --alter --add-config 'SCRAM-SHA-256=[password=admin-secret-256],SCRAM-SHA-512=[password=admin-secret-512]' --entity-type users --entity-name adminscram
steps:
- checkout
- setup_remote_docker: { reusable: true, docker_layer_caching: true }
- run: go get -v -t . ./gzip ./lz4 ./sasl ./snappy
- run: go test -v -race -cover -timeout 150s $(go list ./... | grep -v examples)

workflows:
version: 2
run:
Expand All @@ -131,3 +162,4 @@ workflows:
- kafka-011
- kafka-111
- kafka-210
- kafka-221
216 changes: 147 additions & 69 deletions writer.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"io"
"math/rand"
"sort"
"strings"
"sync"
"time"
)
Expand Down Expand Up @@ -133,6 +134,45 @@ type WriterConfig struct {
newPartitionWriter func(partition int, config WriterConfig, stats *writerStats) partitionWriter
}

// WriterError associates a single message with the error that was reported
// for it, so callers can tell which message a failure belongs to.
type WriterError struct {
// Msg is the message the error refers to.
Msg Message
// Err is the underlying error; it is what Error, Temporary, Timeout and
// Unwrap operate on.
Err error
}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we have the WriterError and WriterErrors types implement errors.Wrapper?

Copy link
Author

@Evanjt1 Evanjt1 Feb 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do; Unwrap() is straightforward for WriterError. Do you have thoughts on the behavior for WriterErrors?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


// Error returns the message of the wrapped error, which makes *WriterError
// satisfy the error interface.
//
// WriterError and its fields are exported, so a value with a nil Err (or a
// nil *WriterError stored in an error interface) can reach this method.
// Calling Err.Error() in that situation would panic, typically in the middle
// of logging; "<nil>" is returned instead.
func (e *WriterError) Error() string {
	if e == nil || e.Err == nil {
		return "<nil>"
	}
	return e.Err.Error()
}

// Temporary reports the result of isTemporary applied to the wrapped error.
func (e *WriterError) Temporary() bool {
	cause := e.Err
	return isTemporary(cause)
}

// Timeout reports the result of isTimeout applied to the wrapped error.
func (e *WriterError) Timeout() bool {
	cause := e.Err
	return isTimeout(cause)
}

// Unwrap returns the wrapped error, allowing errors.Is, errors.As and
// errors.Unwrap to look through a *WriterError at the error it carries.
func (e *WriterError) Unwrap() error {
	inner := e.Err
	return inner
}

// WriterErrors is a list of per-message write failures. It implements the
// error interface so that several failures can be returned as a single error
// value.
type WriterErrors []WriterError

// Error renders a header with the number of failures followed by one bullet
// per failure, each on its own tab-indented line, e.g.
//
//	2 WriterErrors occurred:
//		* first error
//		* second error
//
// The header uses the singular "WriterError" when there is exactly one entry.
// An empty list produces only the header line.
func (wes WriterErrors) Error() string {
	var b strings.Builder

	if len(wes) == 1 {
		b.WriteString("1 WriterError occurred:")
	} else {
		fmt.Fprintf(&b, "%d WriterErrors occurred:", len(wes))
	}

	for i := range wes {
		// %v instead of %s: both print err.Error() for a non-nil error, but a
		// nil Err is rendered as "<nil>" rather than "%!s(<nil>)".
		fmt.Fprintf(&b, "\n\t* %v", wes[i].Err)
	}

	b.WriteByte('\n')
	return b.String()
}

// WriterStats is a data structure returned by a call to Writer.Stats that
// exposes details about the behavior of the writer.
type WriterStats struct {
Expand Down Expand Up @@ -302,57 +342,93 @@ func (w *Writer) WriteMessages(ctx context.Context, msgs ...Message) error {
return nil
}

var err error
var res chan error
errs := make(WriterErrors, 0, len(msgs))
var res chan writerResponse
if !w.config.Async {
res = make(chan error, len(msgs))
res = make(chan writerResponse, len(msgs))
}
t0 := time.Now()
// The observation is wrapped in a closure on purpose: the arguments of a
// deferred call are evaluated when the defer statement executes, so deferring
// observeDuration(time.Since(t0)) directly would always record a duration of
// roughly zero instead of the time spent in this call.
defer func() { w.stats.writeTime.observeDuration(time.Since(t0)) }()

handled := make(map[int]bool, len(msgs))
w.mutex.RLock()
closed := w.closed
w.mutex.RUnlock()

if closed {
return io.ErrClosedPipe
if w.closed {
w.mutex.RUnlock()
for _, m := range msgs {
errs = append(errs, WriterError{
Msg: m,
Err: io.ErrClosedPipe,
})
}
return errs
}

for i, msg := range msgs {

if int(msg.size()) > w.config.BatchBytes {
err := MessageTooLargeError{
Message: msg,
Remaining: msgs[i+1:],
errs = append(errs, WriterError{
Msg: msg,
Err: MessageTooLargeError{
Message: msg,
},
})
handled[i] = true
} else {
select {
case w.msgs <- writerMessage{
msg: msg,
res: res,
id: i,
}:
case <-ctx.Done():
w.mutex.RUnlock()
for j, m := range msgs {
// don't double count MessageTooLargeErrors which may already be present in errs
if _, ok := handled[j]; !ok {
errs = append(errs, WriterError{
Msg: m,
Err: ctx.Err(),
})
}
}
return errs
}
return err
}

wm := writerMessage{msg: msg, res: res}

select {
case w.msgs <- wm:
case <-ctx.Done():
return ctx.Err()
}
}

w.mutex.RUnlock()
if w.config.Async {
if len(errs) > 0 {
return errs
}
return nil
}

for i := 0; i != len(msgs); i++ {
sent := len(msgs) - len(handled)
for i := 0; i != sent; i++ {
select {
case e := <-res:
if e != nil {
err = e
case r := <-res:
handled[r.id] = true
if r.err != nil {
errs = append(errs, *r.err)
}
case <-ctx.Done():
return ctx.Err()
// all unacked msgs become errors
for x := range msgs {
if _, ok := handled[x]; !ok {
errs = append(errs, WriterError{
Msg: msgs[x],
Err: ctx.Err(),
})
}
}
return errs
}
}

return err
if len(errs) > 0 {
return errs
}
return nil
}

// Stats returns a snapshot of the writer stats since the last time the method
Expand Down Expand Up @@ -459,7 +535,13 @@ func (w *Writer) run() {
err = fmt.Errorf("failed to find any partitions for topic %s", w.config.Topic)
}
if wm.res != nil {
wm.res <- &writerError{msg: wm.msg, err: err}
wm.res <- writerResponse{
id: wm.id,
err: &WriterError{
Msg: wm.msg,
Err: err,
},
}
}
}

Expand Down Expand Up @@ -599,7 +681,8 @@ func (w *writer) run() {
var conn *Conn
var done bool
var batch = make([]Message, 0, w.batchSize)
var resch = make([](chan<- error), 0, w.batchSize)
var resch = make([](chan<- writerResponse), 0, w.batchSize)
var ids = make([]int, 0, w.batchSize)
var lastMsg writerMessage
var batchSizeBytes int
var idleConnDeadline time.Time
Expand All @@ -616,9 +699,8 @@ func (w *writer) run() {
// If a lastMsg exists we need to add it to the batch so we don't lose it.
if len(lastMsg.msg.Value) != 0 {
batch = append(batch, lastMsg.msg)
if lastMsg.res != nil {
resch = append(resch, lastMsg.res)
}
resch = append(resch, lastMsg.res)
ids = append(ids, lastMsg.id)
batchSizeBytes += int(lastMsg.msg.size())
lastMsg = writerMessage{}
if !batchTimerRunning {
Expand All @@ -639,9 +721,8 @@ func (w *writer) run() {
break
}
batch = append(batch, wm.msg)
if wm.res != nil {
resch = append(resch, wm.res)
}
resch = append(resch, wm.res)
ids = append(ids, wm.id)
batchSizeBytes += int(wm.msg.size())
mustFlush = len(batch) >= w.batchSize || batchSizeBytes >= w.maxMessageBytes
}
Expand Down Expand Up @@ -672,7 +753,7 @@ func (w *writer) run() {
}

var err error
if conn, err = w.writeWithRetries(conn, batch, resch); err != nil {
if conn, err = w.writeWithRetries(conn, batch, resch, ids); err != nil {
if conn != nil {
conn.Close()
conn = nil
Expand All @@ -687,8 +768,13 @@ func (w *writer) run() {
for i := range resch {
resch[i] = nil
}

for i := range ids {
ids[i] = -1
}
batch = batch[:0]
resch = resch[:0]
ids = ids[:0]
batchSizeBytes = 0
}
}
Expand All @@ -708,31 +794,44 @@ func (w *writer) dial() (conn *Conn, err error) {
return
}

func (w *writer) writeWithRetries(conn *Conn, batch []Message, resch [](chan<- error)) (*Conn, error) {
func (w *writer) writeWithRetries(conn *Conn, batch []Message, resch [](chan<- writerResponse), ids []int) (*Conn, error) {
var err error

for attempt := 0; attempt < w.maxAttempts; attempt++ {
conn, err = w.write(conn, batch, resch)
conn, err = w.write(conn, batch, resch, ids)
if err == nil {
break
}
w.stats.retries.observe(1)
time.Sleep(backoff(attempt+1, 100*time.Millisecond, 1*time.Second))
}

for i, res := range resch {
if res != nil {
var we *WriterError
if err != nil {
we = &WriterError{
Msg: batch[i],
Err: err,
}
}
res <- writerResponse{
id: ids[i],
err: we,
}
}
}

return conn, err
}

func (w *writer) write(conn *Conn, batch []Message, resch [](chan<- error)) (ret *Conn, err error) {
func (w *writer) write(conn *Conn, batch []Message, resch [](chan<- writerResponse), ids []int) (ret *Conn, err error) {
w.stats.writes.observe(1)
if conn == nil {
if conn, err = w.dial(); err != nil {
w.stats.errors.observe(1)
w.withErrorLogger(func(logger Logger) {
logger.Printf("error dialing kafka brokers for topic %s (partition %d): %s", w.topic, w.partition, err)
})
for i, res := range resch {
res <- &writerError{msg: batch[i], err: err}
}
return
}
}
Expand All @@ -744,17 +843,11 @@ func (w *writer) write(conn *Conn, batch []Message, resch [](chan<- error)) (ret
w.withErrorLogger(func(logger Logger) {
logger.Printf("error writing messages to %s (partition %d): %s", w.topic, w.partition, err)
})
for i, res := range resch {
res <- &writerError{msg: batch[i], err: err}
}
} else {
for _, m := range batch {
w.stats.messages.observe(1)
w.stats.bytes.observe(int64(len(m.Key) + len(m.Value)))
}
for _, res := range resch {
res <- nil
}
}
t1 := time.Now()
w.stats.waitTime.observeDuration(t1.Sub(t0))
Expand All @@ -766,28 +859,13 @@ func (w *writer) write(conn *Conn, batch []Message, resch [](chan<- error)) (ret

type writerMessage struct {
msg Message
res chan<- error
}

type writerError struct {
msg Message
err error
}

func (e *writerError) Cause() error {
return e.err
}

func (e *writerError) Error() string {
return e.err.Error()
}

func (e *writerError) Temporary() bool {
return isTemporary(e.err)
res chan<- writerResponse
id int
}

func (e *writerError) Timeout() bool {
return isTimeout(e.err)
// writerResponse is the value sent back on a writerMessage's res channel to
// report the outcome of writing that message.
type writerResponse struct {
// id echoes the id of the writerMessage this response answers, letting the
// receiver match responses to the messages it submitted.
id int
// err describes the failure for the message; it is left nil when the write
// completed without error.
err *WriterError
}

func shuffledStrings(list []string) []string {
Expand Down
Loading