github.com/NVIDIA/aistore@v1.3.23-0.20240517131212-7df6609be51d/transport/base.go

// Package transport provides long-lived http/tcp connections for
// intra-cluster communications (see README for details and usage example).
/*
 * Copyright (c) 2018-2023, NVIDIA CORPORATION. All rights reserved.
 */
package transport

import (
	"fmt"
	"io"
	"net/url"
	"os"
	"path"
	"strconv"
	"sync"
	"time"

	"github.com/NVIDIA/aistore/api/apc"
	"github.com/NVIDIA/aistore/cmn"
	"github.com/NVIDIA/aistore/cmn/atomic"
	"github.com/NVIDIA/aistore/cmn/cos"
	"github.com/NVIDIA/aistore/cmn/debug"
	"github.com/NVIDIA/aistore/cmn/nlog"
)

// stream TCP/HTTP session: inactive <=> active transitions
const (
	inactive = iota
	active
)

// in-send states
const (
	inHdr = iota + 1
	inPDU
	inData
	inEOB
)

const maxInReadRetries = 64 // Tx: lz4 stream read; Rx: partial object header

// termination: reasons
const (
	reasonError   = "error"
	endOfStream   = "end-of-stream"
	reasonStopped = "stopped"

	connErrWait = time.Second // ECONNREFUSED | ECONNRESET | EPIPE
	termErrWait = time.Second
)

type (
	streamer interface {
		compressed() bool
		dryrun()
		terminate(error, string) (string, error)
		doRequest() error
		inSend() bool
		abortPending(error, bool)
		errCmpl(error)
		resetCompression()
		// gc
		closeAndFree()
		drain(err error)
		idleTick()
	}
	streamBase struct {
		streamer streamer
		client   Client        // stream's http client
		stopCh   cos.StopCh    // stop/abort stream
		lastCh   cos.StopCh    // end-of-stream
		pdu      *spdu         // PDU buffer
		postCh   chan struct{} // to indicate that workCh has work
		trname   string        // http endpoint: (trname, dstURL, dstID)
		dstURL   string
		dstID    string
		lid      string // log prefix
		maxhdr   []byte // header buf must be large enough to accommodate max-size for this stream
		header   []byte // object header (slice of the maxhdr with bucket/objName, etc. fields packed/serialized)
		term     struct {
			err    error
			reason string
			mu     sync.Mutex
			done   atomic.Bool
		}
		stats Stats // stream stats (send side - compare with rxStats)
		time  struct {
			idleTeardown time.Duration // idle timeout
			inSend       atomic.Bool   // true upon Send() or Read() - info for Collector to delay cleanup
			ticks        int           // num 1s ticks until idle timeout
			index        int           // heap stuff
		}
		wg       sync.WaitGroup
		sessST   atomic.Int64 // state of the TCP/HTTP session: active (connected) | inactive (disconnected)
		sessID   int64        // stream session ID
		numCur   int64        // gets reset to zero upon each timeout
		sizeCur  int64        // ditto
		chanFull atomic.Int64
	}
)
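
// A concrete stream implementation in this package is expected to embed
// streamBase and register itself as its `streamer`, so that sendLoop (below)
// can dispatch doRequest, dryrun, terminate, and friends through the interface.
// A minimal sketch of that wiring, assuming a hypothetical exampleStream type
// (the real type and its constructor live elsewhere in this package):
//
//	type exampleStream struct {
//		streamBase
//		// workCh, completion handling, optional compression state, etc.
//	}
//
//	// exampleStream must also implement the streamer interface (not shown).
//	func newExampleStream(client Client, dstURL, dstID string, extra *Extra) *exampleStream {
//		es := &exampleStream{streamBase: *newBase(client, dstURL, dstID, extra)}
//		es.streamer = es // sendLoop and gc call back via this field
//		return es
//	}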

////////////////
// streamBase //
////////////////

func newBase(client Client, dstURL, dstID string, extra *Extra) (s *streamBase) {
	var (
		sid    string
		u, err = url.Parse(dstURL)
	)
	debug.AssertNoErr(err)

	s = &streamBase{client: client, dstURL: dstURL, dstID: dstID}

	s.sessID = nextSessionID.Inc()
	s.trname = path.Base(u.Path)

	s.lastCh.Init()
	s.stopCh.Init()
	s.postCh = make(chan struct{}, 1)

	// default overrides
	if extra.SenderID != "" {
		sid = "-" + extra.SenderID
	}
	// NOTE: PDU-based traffic - a MUST-have for "unsized" transmissions
	if extra.UsePDU() {
		if extra.SizePDU > maxSizePDU {
			debug.Assert(false)
			extra.SizePDU = maxSizePDU
		}
		buf, _ := g.mm.AllocSize(int64(extra.SizePDU))
		s.pdu = newSendPDU(buf)
	}
	if extra.IdleTeardown > 0 {
		s.time.idleTeardown = extra.IdleTeardown
	} else {
		s.time.idleTeardown = extra.Config.Transport.IdleTeardown.D()
		if s.time.idleTeardown == 0 {
			s.time.idleTeardown = dfltIdleTeardown
		}
	}
	debug.Assertf(s.time.idleTeardown >= dfltTick, "%v vs. %v", s.time.idleTeardown, dfltTick)
	s.time.ticks = int(s.time.idleTeardown / dfltTick)

	s.lid = fmt.Sprintf("s-%s%s[%d]=>%s", s.trname, sid, s.sessID, dstID)

	if extra.MaxHdrSize == 0 {
		s.maxhdr, _ = g.mm.AllocSize(dfltMaxHdr)
	} else {
		s.maxhdr, _ = g.mm.AllocSize(int64(extra.MaxHdrSize))
		cos.AssertMsg(extra.MaxHdrSize <= 0xffff, "the field is uint16") // same comment in header.go
	}
	s.sessST.Store(inactive) // initiate HTTP session upon the first arrival
	return
}

func (s *streamBase) startSend(streamable fmt.Stringer) (err error) {
	s.time.inSend.Store(true) // StreamCollector to postpone cleanups

	if s.IsTerminated() {
		// slow path
		reason, errT := s.TermInfo()
		err = cmn.NewErrStreamTerminated(s.String(), errT, reason, "dropping "+streamable.String())
		nlog.Errorln(err)
		return
	}

	if s.sessST.CAS(inactive, active) {
		s.postCh <- struct{}{}
		if verbose {
			nlog.Infof("%s: inactive => active", s)
		}
	}
	return
}

func (s *streamBase) Stop()               { s.stopCh.Close() }
func (s *streamBase) URL() string         { return s.dstURL }
func (s *streamBase) ID() (string, int64) { return s.trname, s.sessID }
func (s *streamBase) String() string      { return s.lid }

func (s *streamBase) Abort() { s.Stop() } // (DM =>) SB => s.Abort() sequence (e.g. usage see otherXreb.Abort())

func (s *streamBase) IsTerminated() bool { return s.term.done.Load() }

func (s *streamBase) TermInfo() (reason string, err error) {
	// to account for an unlikely delay between done.CAS() and mu.Lock - see terminate()
	sleep := cos.ProbingFrequency(termErrWait)
	for elapsed := time.Duration(0); elapsed < termErrWait; elapsed += sleep {
		s.term.mu.Lock()
		reason, err = s.term.reason, s.term.err
		s.term.mu.Unlock()
		if reason != "" {
			break
		}
		time.Sleep(sleep)
	}
	return
}

func (s *streamBase) GetStats() (stats Stats) {
	// byte-num transfer stats
	stats.Num.Store(s.stats.Num.Load())
	stats.Offset.Store(s.stats.Offset.Load())
	stats.Size.Store(s.stats.Size.Load())
	stats.CompressedSize.Store(s.stats.CompressedSize.Load())
	return
}

func (s *streamBase) isNextReq() (reason string) {
	for {
		select {
		case <-s.lastCh.Listen():
			if verbose {
				nlog.Infof("%s: end-of-stream", s)
			}
			reason = endOfStream
			return
		case <-s.stopCh.Listen():
			if verbose {
				nlog.Infof("%s: stopped", s)
			}
			reason = reasonStopped
			return
		case <-s.postCh:
			s.sessST.Store(active)
			if verbose {
				nlog.Infof("%s: active <- posted", s)
			}
			return
		}
	}
}

func (s *streamBase) deactivate() (n int, err error) {
	err = io.EOF
	if verbose {
		num := s.stats.Num.Load()
		nlog.Infof("%s: connection teardown (%d/%d)", s, s.numCur, num)
	}
	return
}
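
// sendLoop is the stream's main send-side loop: while the session is active it
// issues streamer.doRequest() (or streamer.dryrun() in dry-run mode), retrying a
// retriable connection error once before giving up; otherwise it blocks in
// isNextReq() waiting for posted work, end-of-stream, or Stop(). On exit it calls
// streamer.terminate(); unless the stream ended normally (end-of-stream), it then
// waits for completions to drain and aborts pending sends, logging the termination
// error for any reason other than an explicit Stop().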
func (s *streamBase) sendLoop(dryrun bool) {
	var (
		err     error
		reason  string
		retried bool
	)
	for {
		if s.sessST.Load() == active {
			if dryrun {
				s.streamer.dryrun()
			} else if errR := s.streamer.doRequest(); errR != nil {
				if !cos.IsRetriableConnErr(errR) || retried {
					reason = reasonError
					err = errR
					s.streamer.errCmpl(err)
					break
				}
				retried = true
				nlog.Errorf("%s: %v - retrying...", s, errR)
				time.Sleep(connErrWait)
			}
		}
		if reason = s.isNextReq(); reason != "" {
			break
		}
	}

	reason, err = s.streamer.terminate(err, reason)
	s.wg.Done()

	if reason == endOfStream {
		return
	}

	// termination is caused by anything other than Fin()
	// (reasonStopped is, effectively, abort via Stop() - totally legit)
	if reason != reasonStopped {
		nlog.Errorf("%s: terminating (%s, %v)", s, reason, err)
	}

	// wait for the SCQ/cmplCh to empty
	s.wg.Wait()

	// cleanup
	s.streamer.abortPending(err, false /*completions*/)

	if cnt := s.chanFull.Load(); (cnt >= 10 && cnt <= 20) || (cnt > 0 && verbose) {
		nlog.Errorln("work channel full", s.lid, cnt)
	}
}

///////////
// Extra //
///////////

func (extra *Extra) UsePDU() bool { return extra.SizePDU > 0 }

func (extra *Extra) Compressed() bool {
	return extra.Compression != "" && extra.Compression != apc.CompressNever
}

//
// misc
//

func dryrun() (dryrun bool) {
	var err error
	if a := os.Getenv("AIS_STREAM_DRY_RUN"); a != "" {
		if dryrun, err = strconv.ParseBool(a); err != nil {
			nlog.Errorln(err)
		}
	}
	return
}
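
// Dry-run mode: when AIS_STREAM_DRY_RUN parses to true (see dryrun() above and
// strconv.ParseBool), sendLoop invokes streamer.dryrun() instead of issuing HTTP
// requests, so nothing is sent over the wire. A minimal, hypothetical way to turn
// it on from test code (any ParseBool-accepted value, e.g. "1" or "true", works):
//
//	_ = os.Setenv("AIS_STREAM_DRY_RUN", "true")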