Skip to content

Commit

Permalink
Add sufficient deadlines and countermeasures to handle hung node scen…
Browse files Browse the repository at this point in the history
…ario

Signed-off-by: Shubhendu Ram Tripathi <shubhendu@minio.io>
Signed-off-by: Harshavardhana <harsha@minio.io>
  • Loading branch information
harshavardhana committed May 17, 2024
1 parent fc4561c commit 08e342f
Show file tree
Hide file tree
Showing 16 changed files with 212 additions and 99 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/mint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ jobs:
run: |
${GITHUB_WORKSPACE}/.github/workflows/run-mint.sh "erasure" "minio" "minio123" "${{ steps.vars.outputs.sha_short }}"
- name: resiliency
run: |
${GITHUB_WORKSPACE}/.github/workflows/run-mint.sh "resiliency" "minio" "minio123" "${{ steps.vars.outputs.sha_short }}"
- name: The job must cleanup
if: ${{ always() }}
run: |
Expand Down
78 changes: 78 additions & 0 deletions .github/workflows/mint/minio-resiliency.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
version: '3.7'

# Settings and configurations that are common for all containers
x-minio-common: &minio-common
image: quay.io/minio/minio:${JOB_NAME}
command: server --console-address ":9001" http://minio{1...4}/rdata{1...2}
expose:
- "9000"
- "9001"
environment:
MINIO_CI_CD: "on"
MINIO_ROOT_USER: "minio"
MINIO_ROOT_PASSWORD: "minio123"
MINIO_KMS_SECRET_KEY: "my-minio-key:OSMM+vkKUTCvQs9YL/CVMIMt43HFhkUpqJxTmGl6rYw="
MINIO_DRIVE_MAX_TIMEOUT: "5s"
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5

# starts 4 docker containers running minio server instances.
# using nginx reverse proxy, load balancing, you can access
# it through port 9000.
services:
minio1:
<<: *minio-common
hostname: minio1
volumes:
- rdata1-1:/rdata1
- rdata1-2:/rdata2

minio2:
<<: *minio-common
hostname: minio2
volumes:
- rdata2-1:/rdata1
- rdata2-2:/rdata2

minio3:
<<: *minio-common
hostname: minio3
volumes:
- rdata3-1:/rdata1
- rdata3-2:/rdata2

minio4:
<<: *minio-common
hostname: minio4
volumes:
- rdata4-1:/rdata1
- rdata4-2:/rdata2

nginx:
image: nginx:1.19.2-alpine
hostname: nginx
volumes:
- ./nginx-4-node.conf:/etc/nginx/nginx.conf:ro
ports:
- "9000:9000"
- "9001:9001"
depends_on:
- minio1
- minio2
- minio3
- minio4

## By default this config uses default local driver,
## For custom volumes replace with volume driver configuration.
volumes:
rdata1-1:
rdata1-2:
rdata2-1:
rdata2-2:
rdata3-1:
rdata3-2:
rdata4-1:
rdata4-2:
5 changes: 4 additions & 1 deletion .github/workflows/run-mint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ docker volume rm $(docker volume ls -f dangling=true) || true
cd .github/workflows/mint

docker-compose -f minio-${MODE}.yaml up -d
sleep 30s
sleep 1m

docker system prune -f || true
docker volume prune -f || true
Expand All @@ -26,6 +26,9 @@ docker volume rm $(docker volume ls -q -f dangling=true) || true
[ "${MODE}" == "pools" ] && docker-compose -f minio-${MODE}.yaml stop minio2
[ "${MODE}" == "pools" ] && docker-compose -f minio-${MODE}.yaml stop minio6

# Pause one node, to check that all S3 calls work while one node goes wrong
[ "${MODE}" == "resiliency" ] && docker-compose -f minio-${MODE}.yaml pause minio4

docker run --rm --net=mint_default \
--name="mint-${MODE}-${JOB_NAME}" \
-e SERVER_ENDPOINT="nginx:9000" \
Expand Down
12 changes: 7 additions & 5 deletions cmd/admin-server-info.go
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// Copyright (c) 2015-2021 MinIO, Inc.
// Copyright (c) 2015-2024 MinIO, Inc.
//
// This file is part of MinIO Object Storage stack
//
Expand All @@ -18,7 +18,6 @@
package cmd

import (
"context"
"math"
"net/http"
"os"
Expand All @@ -31,6 +30,7 @@ import (
"github.com/minio/madmin-go/v3"
"github.com/minio/minio/internal/config"
"github.com/minio/minio/internal/kms"
xnet "github.com/minio/pkg/v2/net"
)

// getLocalServerProperty - returns madmin.ServerProperties for only the
Expand Down Expand Up @@ -64,9 +64,11 @@ func getLocalServerProperty(endpointServerPools EndpointServerPools, r *http.Req
if err := isServerResolvable(endpoint, 5*time.Second); err == nil {
network[nodeName] = string(madmin.ItemOnline)
} else {
network[nodeName] = string(madmin.ItemOffline)
// log once the error
peersLogOnceIf(context.Background(), err, nodeName)
if xnet.IsNetworkOrHostDown(err, false) {
network[nodeName] = string(madmin.ItemOffline)
} else if xnet.IsNetworkOrHostDown(err, true) {
network[nodeName] = "connection attempt timedout"
}
}
}
}
Expand Down
12 changes: 4 additions & 8 deletions cmd/common-main.go
Original file line number Diff line number Diff line change
Expand Up @@ -404,18 +404,14 @@ func buildServerCtxt(ctx *cli.Context, ctxt *serverCtxt) (err error) {

ctxt.FTP = ctx.StringSlice("ftp")
ctxt.SFTP = ctx.StringSlice("sftp")

ctxt.Interface = ctx.String("interface")
ctxt.UserTimeout = ctx.Duration("conn-user-timeout")
ctxt.ConnReadDeadline = ctx.Duration("conn-read-deadline")
ctxt.ConnWriteDeadline = ctx.Duration("conn-write-deadline")
ctxt.ConnClientReadDeadline = ctx.Duration("conn-client-read-deadline")
ctxt.ConnClientWriteDeadline = ctx.Duration("conn-client-write-deadline")
ctxt.SendBufSize = ctx.Int("send-buf-size")
ctxt.RecvBufSize = ctx.Int("recv-buf-size")

ctxt.ShutdownTimeout = ctx.Duration("shutdown-timeout")
ctxt.IdleTimeout = ctx.Duration("idle-timeout")
ctxt.UserTimeout = ctx.Duration("conn-user-timeout")
ctxt.ShutdownTimeout = ctx.Duration("shutdown-timeout")
ctxt.ConnReadDeadline = ctx.Duration("conn-read-deadline")
ctxt.ConnWriteDeadline = ctx.Duration("conn-write-deadline")
ctxt.ReadHeaderTimeout = ctx.Duration("read-header-timeout")
ctxt.MaxIdleConnsPerHost = ctx.Int("max-idle-conns-per-host")

Expand Down
7 changes: 4 additions & 3 deletions cmd/erasure-multipart.go
Original file line number Diff line number Diff line change
Expand Up @@ -657,7 +657,8 @@ func (er erasureObjects) PutObjectPart(ctx context.Context, bucket, object, uplo
if err != nil {
return PartInfo{}, err
}
pctx := plkctx.Context()

ctx = plkctx.Context()
defer partIDLock.Unlock(plkctx)

onlineDisks := er.getDisks()
Expand Down Expand Up @@ -689,7 +690,7 @@ func (er erasureObjects) PutObjectPart(ctx context.Context, bucket, object, uplo
}
}()

erasure, err := NewErasure(pctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize)
erasure, err := NewErasure(ctx, fi.Erasure.DataBlocks, fi.Erasure.ParityBlocks, fi.Erasure.BlockSize)
if err != nil {
return pi, toObjectErr(err, bucket, object)
}
Expand Down Expand Up @@ -742,7 +743,7 @@ func (er erasureObjects) PutObjectPart(ctx context.Context, bucket, object, uplo
}
}

n, err := erasure.Encode(pctx, toEncode, writers, buffer, writeQuorum)
n, err := erasure.Encode(ctx, toEncode, writers, buffer, writeQuorum)
closeBitrotWriters(writers)
if err != nil {
return pi, toObjectErr(err, bucket, object)
Expand Down
6 changes: 5 additions & 1 deletion cmd/grid.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,12 @@ var globalGridStart = make(chan struct{})

func initGlobalGrid(ctx context.Context, eps EndpointServerPools) error {
hosts, local := eps.GridHosts()
lookupHost := globalDNSCache.LookupHost
g, err := grid.NewManager(ctx, grid.ManagerOptions{
Dialer: grid.ContextDialer(xhttp.DialContextWithLookupHost(globalDNSCache.LookupHost, xhttp.NewInternodeDialContext(rest.DefaultTimeout, globalTCPOptions))),
// Pass Dialer for websocket grid, make sure we do not
// provide any DriveOPTimeout() function, as that is not
// useful over persistent connections.
Dialer: grid.ContextDialer(xhttp.DialContextWithLookupHost(lookupHost, xhttp.NewInternodeDialContext(rest.DefaultTimeout, globalTCPOptions.ForWebsocket()))),
Local: local,
Hosts: hosts,
AddAuth: newCachedAuthToken(),
Expand Down
2 changes: 1 addition & 1 deletion cmd/post-policy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ func testPostPolicyBucketHandler(obj ObjectLayer, instanceType string, t TestErr
// Call the ServeHTTP to execute the handler.
apiRouter.ServeHTTP(rec, req)
if rec.Code != test.expectedStatus {
t.Fatalf("Test %d: %s: Expected the response status to be `%d`, but instead found `%d`", i+1, instanceType, test.expectedStatus, rec.Code)
t.Fatalf("Test %d: %s: Expected the response status to be `%d`, but instead found `%d`, Resp: %s", i+1, instanceType, test.expectedStatus, rec.Code, rec.Body)
}
}

Expand Down
17 changes: 3 additions & 14 deletions cmd/server-main.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,18 +103,6 @@ var ServerFlags = []cli.Flag{
EnvVar: "MINIO_READ_HEADER_TIMEOUT",
Hidden: true,
},
cli.DurationFlag{
Name: "conn-client-read-deadline",
Usage: "custom connection READ deadline for incoming requests",
Hidden: true,
EnvVar: "MINIO_CONN_CLIENT_READ_DEADLINE",
},
cli.DurationFlag{
Name: "conn-client-write-deadline",
Usage: "custom connection WRITE deadline for outgoing requests",
Hidden: true,
EnvVar: "MINIO_CONN_CLIENT_WRITE_DEADLINE",
},
cli.DurationFlag{
Name: "conn-read-deadline",
Usage: "custom connection READ deadline",
Expand Down Expand Up @@ -441,8 +429,9 @@ func serverHandleCmdArgs(ctxt serverCtxt) {

globalTCPOptions = xhttp.TCPOptions{
UserTimeout: int(ctxt.UserTimeout.Milliseconds()),
ClientReadTimeout: ctxt.ConnClientReadDeadline,
ClientWriteTimeout: ctxt.ConnClientWriteDeadline,
ClientReadTimeout: ctxt.ConnReadDeadline,
ClientWriteTimeout: ctxt.ConnWriteDeadline,
DriveOPTimeout: globalDriveConfig.GetOPTimeout,
Interface: ctxt.Interface,
SendBufSize: ctxt.SendBufSize,
RecvBufSize: ctxt.RecvBufSize,
Expand Down
28 changes: 13 additions & 15 deletions internal/config/drive/drive.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,18 @@ import (
"github.com/minio/pkg/v2/env"
)

// Drive specific timeout environment variables
const (
envMaxDriveTimeout = "MINIO_DRIVE_MAX_TIMEOUT"
EnvMaxDriveTimeout = "MINIO_DRIVE_MAX_TIMEOUT"
EnvMaxDriveTimeoutLegacy = "_MINIO_DRIVE_MAX_TIMEOUT"
EnvMaxDiskTimeoutLegacy = "_MINIO_DISK_MAX_TIMEOUT"
)

// DefaultKVS - default KVS for drive
var DefaultKVS = config.KVS{
config.KV{
Key: MaxTimeout,
Value: "",
Value: "30s",
},
}

Expand All @@ -53,8 +56,13 @@ func (c *Config) Update(new Config) error {
return nil
}

// GetMaxTimeout - returns the max timeout value.
// GetMaxTimeout - returns the per call drive operation timeout
func (c *Config) GetMaxTimeout() time.Duration {
return c.GetOPTimeout()
}

// GetOPTimeout - returns the per call drive operation timeout
func (c *Config) GetOPTimeout() time.Duration {
configLk.RLock()
defer configLk.RUnlock()

Expand All @@ -71,14 +79,7 @@ func LookupConfig(kvs config.KVS) (cfg Config, err error) {
}

// if not set. Get default value from environment
d := env.Get(envMaxDriveTimeout, kvs.GetWithDefault(MaxTimeout, DefaultKVS))
if d == "" {
d = env.Get("_MINIO_DRIVE_MAX_TIMEOUT", "")
if d == "" {
d = env.Get("_MINIO_DISK_MAX_TIMEOUT", "")
}
}

d := env.Get(EnvMaxDriveTimeout, env.Get(EnvMaxDriveTimeoutLegacy, env.Get(EnvMaxDiskTimeoutLegacy, kvs.GetWithDefault(MaxTimeout, DefaultKVS))))
dur, _ := time.ParseDuration(d)
if dur < time.Second {
cfg.MaxTimeout = 30 * time.Second
Expand All @@ -91,10 +92,7 @@ func LookupConfig(kvs config.KVS) (cfg Config, err error) {
func getMaxTimeout(t time.Duration) time.Duration {
if t < time.Second {
// get default value
d := env.Get("_MINIO_DRIVE_MAX_TIMEOUT", "")
if d == "" {
d = env.Get("_MINIO_DISK_MAX_TIMEOUT", "")
}
d := env.Get(EnvMaxDriveTimeoutLegacy, env.Get(EnvMaxDiskTimeoutLegacy, ""))
dur, _ := time.ParseDuration(d)
if dur < time.Second {
return 30 * time.Second
Expand Down
3 changes: 2 additions & 1 deletion internal/config/drive/help.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ import "github.com/minio/minio/internal/config"
var (
// MaxTimeout is the max timeout for drive
MaxTimeout = "max_timeout"

// HelpDrive is help for drive
HelpDrive = config.HelpKVS{
config.HelpKV{
Key: MaxTimeout,
Type: "string",
Description: "set per call max_timeout for the drive, defaults to 2 minutes",
Description: "set per call max_timeout for the drive, defaults to 30 seconds",
Optional: true,
},
}
Expand Down

0 comments on commit 08e342f

Please sign in to comment.