observability: add prometheus variable labels; remove collector · NVIDIA/aistore@118a821 · GitHub
[go: up one dir, main page]
More Web Proxy on the site http://driver.im/
Skip to content

Commit 118a821

Browse files
committed
observability: add prometheus variable labels; remove collector
* variable labels: bucket, xaction (job), mountpath
* major update w/ partial rewrite
* part one
* remains:
  - disable prometheus _default_ metrics
  - make `statsValue` polymorphic; absorb prom.go types
  - keep all counters in bytes and nanoseconds (not megabytes/milliseconds)
  - look for a way to optimize `prometheus.Counter`
  - error counting buckets with a different remote backend
  - docs/metrics-reference

Signed-off-by: Alex Aizman <alex.aizman@gmail.com>
1 parent e9a0482 commit 118a821

26 files changed

+547
-423
lines changed

ais/backend/common.go

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
4646
Help: "GET: total number of executed remote requests (cold GETs)",
4747
StrName: "remote_get_count",
4848
Labels: labels,
49+
VarLabs: stats.BckVarlabs,
4950
},
5051
)
5152
tr.RegExtMetric(snode,
@@ -55,6 +56,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
5556
Help: "GET: total cumulative time (nanoseconds) to execute cold GETs and store new object versions in-cluster",
5657
StrName: "remote_get_ns_total",
5758
Labels: labels,
59+
VarLabs: stats.BckVarlabs,
5860
},
5961
)
6062
tr.RegExtMetric(snode,
@@ -65,6 +67,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
6567
"includes: receiving request, executing cold-GET, storing new object version in-cluster, and transmitting response",
6668
StrName: "remote_e2e_get_ns_total",
6769
Labels: labels,
70+
VarLabs: stats.BckVarlabs,
6871
},
6972
)
7073
tr.RegExtMetric(snode,
@@ -73,7 +76,9 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
7376
&stats.Extra{
7477
Help: "GET: total cumulative size (bytes) of all cold-GET transactions",
7578
StrName: "remote_get_bytes_total",
76-
Labels: labels},
79+
Labels: labels,
80+
VarLabs: stats.BckVarlabs,
81+
},
7782
)
7883

7984
// PUT
@@ -89,6 +94,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
8994
Help: "PUT: total number of executed remote requests to a given backend",
9095
StrName: "remote_put_count",
9196
Labels: labels,
97+
VarLabs: stats.BckXactVarlabs,
9298
},
9399
)
94100
tr.RegExtMetric(snode,
@@ -98,6 +104,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
98104
Help: "PUT: total cumulative time (nanoseconds) to execute remote requests and store new object versions in-cluster",
99105
StrName: "remote_put_ns_total",
100106
Labels: labels,
107+
VarLabs: stats.BckXactVarlabs,
101108
},
102109
)
103110
tr.RegExtMetric(snode,
@@ -107,7 +114,9 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
107114
StrName: "remote_e2e_put_ns_total",
108115
Help: "PUT: total end-to-end time (nanoseconds) servicing remote requests; " +
109116
"includes: receiving PUT payload, storing it in-cluster, executing remote PUT, finalizing new in-cluster object",
110-
Labels: labels},
117+
Labels: labels,
118+
VarLabs: stats.BckXactVarlabs,
119+
},
111120
)
112121
tr.RegExtMetric(snode,
113122
b.metrics[stats.PutSize],
@@ -116,6 +125,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
116125
Help: "PUT: total cumulative size (bytes) of all PUTs to a given remote backend",
117126
StrName: "remote_e2e_put_bytes_total",
118127
Labels: labels,
128+
VarLabs: stats.BckXactVarlabs,
119129
},
120130
)
121131

@@ -130,6 +140,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
130140
Help: "HEAD: total number of executed remote requests to a given backend",
131141
StrName: "remote_head_count",
132142
Labels: labels,
143+
VarLabs: stats.BckVarlabs,
133144
},
134145
)
135146
tr.RegExtMetric(snode,
@@ -139,6 +150,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
139150
Help: "HEAD: total cumulative time (nanoseconds) to execute remote requests",
140151
StrName: "remote_head_ns_total",
141152
Labels: labels,
153+
VarLabs: stats.BckVarlabs,
142154
},
143155
)
144156

@@ -153,6 +165,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
153165
Help: "number of out-of-band updates (by a 3rd party performing remote PUTs outside this cluster)",
154166
StrName: "remote_ver_change_count",
155167
Labels: labels,
168+
VarLabs: stats.BckVarlabs,
156169
},
157170
)
158171
tr.RegExtMetric(snode,
@@ -162,6 +175,7 @@ func (b *base) init(snode *meta.Snode, tr stats.Tracker) {
162175
Help: "total cumulative size of objects that were updated out-of-band",
163176
StrName: "remote_ver_change_bytes_total",
164177
Labels: labels,
178+
VarLabs: stats.BckVarlabs,
165179
},
166180
)
167181
}

ais/htrun.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1051,7 +1051,7 @@ func (h *htrun) logerr(tag string, v any, err error) {
10511051
} else {
10521052
nlog.Errorln(msg)
10531053
}
1054-
h.statsT.IncErr(stats.ErrHTTPWriteCount)
1054+
h.statsT.Inc(stats.ErrHTTPWriteCount)
10551055
}
10561056

10571057
func _parseNCopies(value any) (copies int64, err error) {

ais/kalive.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ func (pkr *palive) updateSmap(config *cmn.Config) (stopped bool) {
284284
// otherwise, go keepalive with retries
285285
nlog.Warningln(pkr.p.String(), "failed to fast-kalive", si.StringEx(), "err: [", err, status, "]")
286286

287-
pkr.statsT.IncErr(stats.ErrKaliveCount)
287+
pkr.statsT.Inc(stats.ErrKaliveCount)
288288
wg.Add(1)
289289
go pkr.goping(si, wg, smap, config)
290290
}
@@ -344,7 +344,7 @@ func (pkr *palive) _pingRetry(si *meta.Snode, smap *smapX, config *cmn.Config) (
344344

345345
tout = config.Timeout.MaxKeepalive.D()
346346
nlog.Warningln(pname, "failed to slow-kalive", sname, "- retrying [", err, status, tout, smap.StringEx(), "]")
347-
pkr.statsT.IncErr(stats.ErrKaliveCount)
347+
pkr.statsT.Inc(stats.ErrKaliveCount)
348348

349349
ticker := time.NewTicker(cmn.KeepaliveRetryDuration(config))
350350
ok, stopped = pkr.retry(si, ticker, tout, config.Keepalive.NumRetries)
@@ -440,7 +440,7 @@ func (pkr *palive) retry(si *meta.Snode, ticker *time.Ticker, tout time.Duration
440440
return true, false
441441
}
442442

443-
pkr.statsT.IncErr(stats.ErrKaliveCount)
443+
pkr.statsT.Inc(stats.ErrKaliveCount)
444444
i++
445445

446446
if i >= kaNumRetries {
@@ -583,7 +583,7 @@ func (k *keepalive) do(smap *smapX, si *meta.Snode, config *cmn.Config) (stopped
583583
return
584584
}
585585

586-
k.statsT.IncErr(stats.ErrKaliveCount)
586+
k.statsT.Inc(stats.ErrKaliveCount)
587587

588588
debug.Assert(cpid == pid && cpid != si.ID())
589589
nlog.Warningln(sname, "=>", pname, "failure - retrying: [", fast, tout, err, status, "]")

ais/proxy.go

Lines changed: 26 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -680,7 +680,7 @@ func (p *proxy) httpbckget(w http.ResponseWriter, r *http.Request, dpq *dpq) {
680680
}
681681
lsmsg.Prefix = cos.TrimPrefix(lsmsg.Prefix)
682682
if err := cmn.ValidatePrefix("bad list-objects request", lsmsg.Prefix); err != nil {
683-
p.statsT.IncErr(stats.ErrListCount)
683+
p.statsT.IncBck(stats.ErrListCount, bck.Bucket())
684684
p.writeErr(w, r, err)
685685
return
686686
}
@@ -698,7 +698,6 @@ func (p *proxy) httpbckget(w http.ResponseWriter, r *http.Request, dpq *dpq) {
698698
// do
699699
bck, errN := bckArgs.initAndTry()
700700
if errN != nil {
701-
p.statsT.IncErr(stats.ErrListCount)
702701
return
703702
}
704703
p.listObjects(w, r, bck, msg /*amsg*/, &lsmsg)
@@ -733,11 +732,11 @@ func (p *proxy) httpobjget(w http.ResponseWriter, r *http.Request, origURLBck ..
733732
objName := apireq.items[1]
734733
apiReqFree(apireq)
735734
if err != nil {
736-
p.statsT.IncErr(stats.ErrGetCount)
737735
return
738736
}
737+
739738
if err := cmn.ValidOname(objName); err != nil {
740-
p.statsT.IncErr(stats.ErrGetCount)
739+
p.statsT.IncBck(stats.ErrGetCount, bck.Bucket())
741740
p.writeErr(w, r, err)
742741
return
743742
}
@@ -748,7 +747,7 @@ func (p *proxy) httpobjget(w http.ResponseWriter, r *http.Request, origURLBck ..
748747
smap := p.owner.smap.get()
749748
tsi, netPub, err := smap.HrwMultiHome(bck.MakeUname(objName))
750749
if err != nil {
751-
p.statsT.IncErr(stats.ErrGetCount)
750+
p.statsT.IncBck(stats.ErrGetCount, bck.Bucket())
752751
p.writeErr(w, r, err)
753752
return
754753
}
@@ -760,7 +759,7 @@ func (p *proxy) httpobjget(w http.ResponseWriter, r *http.Request, origURLBck ..
760759
http.Redirect(w, r, redirectURL, http.StatusMovedPermanently)
761760

762761
// 4. stats
763-
p.statsT.Inc(stats.GetCount)
762+
p.statsT.IncBck(stats.GetCount, bck.Bucket())
764763
}
765764

766765
// PUT /v1/objects/bucket-name/object-name
@@ -771,6 +770,7 @@ func (p *proxy) httpobjput(w http.ResponseWriter, r *http.Request, apireq *apiRe
771770
errcnt = stats.ErrPutCount
772771
scnt = stats.PutCount
773772
perms = apc.AcePUT
773+
vlabs = map[string]string{stats.VarlabBucket: "", stats.VarlabXactKind: "", stats.VarlabXactID: ""}
774774
)
775775
// 1. request
776776
if err := p.parseReq(w, r, apireq); err != nil {
@@ -782,6 +782,7 @@ func (p *proxy) httpobjput(w http.ResponseWriter, r *http.Request, apireq *apiRe
782782
perms = apc.AceAPPEND
783783
errcnt = stats.ErrAppendCount
784784
scnt = stats.AppendCount
785+
vlabs = map[string]string{stats.VarlabBucket: ""}
785786
if apireq.dpq.apnd.hdl != "" {
786787
items, err := preParse(apireq.dpq.apnd.hdl) // apc.QparamAppendHandle
787788
if err != nil {
@@ -805,9 +806,9 @@ func (p *proxy) httpobjput(w http.ResponseWriter, r *http.Request, apireq *apiRe
805806
bck, err := bckArgs.initAndTry()
806807
freeBctx(bckArgs)
807808
if err != nil {
808-
p.statsT.IncErr(errcnt)
809809
return
810810
}
811+
vlabs[stats.VarlabBucket] = bck.Cname("")
811812

812813
// 3. redirect
813814
var (
@@ -818,20 +819,20 @@ func (p *proxy) httpobjput(w http.ResponseWriter, r *http.Request, apireq *apiRe
818819
netPub = cmn.NetPublic
819820
)
820821
if err := cmn.ValidOname(objName); err != nil {
821-
p.statsT.IncErr(errcnt)
822+
p.statsT.IncWith(errcnt, vlabs)
822823
p.writeErr(w, r, err)
823824
return
824825
}
825826
if nodeID == "" {
826827
tsi, netPub, err = smap.HrwMultiHome(bck.MakeUname(objName))
827828
if err != nil {
828-
p.statsT.IncErr(errcnt)
829+
p.statsT.IncWith(errcnt, vlabs)
829830
p.writeErr(w, r, err)
830831
return
831832
}
832833
} else {
833834
if tsi = smap.GetTarget(nodeID); tsi == nil {
834-
p.statsT.IncErr(errcnt)
835+
p.statsT.IncWith(errcnt, vlabs)
835836
err = &errNodeNotFound{p.si, smap, verb + " failure:", nodeID}
836837
p.writeErr(w, r, err)
837838
return
@@ -851,7 +852,7 @@ func (p *proxy) httpobjput(w http.ResponseWriter, r *http.Request, apireq *apiRe
851852
http.Redirect(w, r, redirectURL, http.StatusTemporaryRedirect)
852853

853854
// 4. stats
854-
p.statsT.Inc(scnt)
855+
p.statsT.IncWith(scnt, vlabs)
855856
}
856857

857858
// DELETE /v1/objects/bucket-name/object-name
@@ -869,14 +870,14 @@ func (p *proxy) httpobjdelete(w http.ResponseWriter, r *http.Request) {
869870
return
870871
}
871872
if err := cmn.ValidOname(objName); err != nil {
872-
p.statsT.IncErr(stats.ErrDeleteCount)
873+
p.statsT.IncBck(stats.ErrDeleteCount, bck.Bucket())
873874
p.writeErr(w, r, err)
874875
return
875876
}
876877
smap := p.owner.smap.get()
877878
tsi, err := smap.HrwName2T(bck.MakeUname(objName))
878879
if err != nil {
879-
p.statsT.IncErr(stats.ErrDeleteCount)
880+
p.statsT.IncBck(stats.ErrDeleteCount, bck.Bucket())
880881
p.writeErr(w, r, err)
881882
return
882883
}
@@ -886,7 +887,7 @@ func (p *proxy) httpobjdelete(w http.ResponseWriter, r *http.Request) {
886887
redirectURL := p.redirectURL(r, tsi, time.Now() /*started*/, cmn.NetIntraControl)
887888
http.Redirect(w, r, redirectURL, http.StatusTemporaryRedirect)
888889

889-
p.statsT.Inc(stats.DeleteCount)
890+
p.statsT.IncBck(stats.DeleteCount, bck.Bucket())
890891
}
891892

892893
// DELETE { action } /v1/buckets
@@ -1643,7 +1644,7 @@ func (p *proxy) listObjects(w http.ResponseWriter, r *http.Request, bck *meta.Bc
16431644
// LsVerChanged a.k.a. '--check-versions' limitations
16441645
if lsmsg.IsFlagSet(apc.LsVerChanged) {
16451646
if err := _checkVerChanged(bck, lsmsg); err != nil {
1646-
p.statsT.IncErr(stats.ErrListCount)
1647+
p.statsT.IncBck(stats.ErrListCount, bck.Bucket())
16471648
p.writeErr(w, r, err)
16481649
return
16491650
}
@@ -1671,13 +1672,15 @@ func (p *proxy) listObjects(w http.ResponseWriter, r *http.Request, bck *meta.Bc
16711672
beg := mono.NanoTime()
16721673
lst, err := p.lsPage(bck, amsg, lsmsg, r.Header, p.owner.smap.get())
16731674
if err != nil {
1674-
p.statsT.IncErr(stats.ErrListCount)
1675+
p.statsT.IncBck(stats.ErrListCount, bck.Bucket())
16751676
p.writeErr(w, r, err)
16761677
return
16771678
}
1678-
p.statsT.AddMany(
1679-
cos.NamedVal64{Name: stats.ListCount, Value: 1},
1680-
cos.NamedVal64{Name: stats.ListLatency, Value: mono.SinceNano(beg)},
1679+
1680+
vlabs := map[string]string{stats.VarlabBucket: bck.Cname("")}
1681+
p.statsT.AddWith(
1682+
cos.NamedVal64{Name: stats.ListCount, Value: 1, VarLabs: vlabs},
1683+
cos.NamedVal64{Name: stats.ListLatency, Value: mono.SinceNano(beg), VarLabs: vlabs},
16811684
)
16821685

16831686
var ok bool
@@ -1846,18 +1849,18 @@ func (p *proxy) httpobjpost(w http.ResponseWriter, r *http.Request, apireq *apiR
18461849
switch msg.Action {
18471850
case apc.ActRenameObject:
18481851
if err := p.checkAccess(w, r, bck, apc.AceObjMOVE); err != nil {
1849-
p.statsT.IncErr(stats.ErrRenameCount)
1852+
p.statsT.IncBck(stats.ErrRenameCount, bck.Bucket())
18501853
return
18511854
}
18521855
if err := _checkObjMv(bck, msg, apireq); err != nil {
1853-
p.statsT.IncErr(stats.ErrRenameCount)
1856+
p.statsT.IncBck(stats.ErrRenameCount, bck.Bucket())
18541857
p.writeErr(w, r, err)
18551858
}
18561859
p.redirectAction(w, r, bck, apireq.items[1], msg)
1857-
p.statsT.Inc(stats.RenameCount)
1860+
p.statsT.IncBck(stats.RenameCount, bck.Bucket())
18581861
case apc.ActPromote:
18591862
if err := p.checkAccess(w, r, bck, apc.AcePromote); err != nil {
1860-
p.statsT.IncErr(stats.ErrRenameCount)
1863+
p.statsT.IncBck(stats.ErrRenameCount, bck.Bucket())
18611864
return
18621865
}
18631866
// ActionMsg.Name is the source

ais/prxbck.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,7 @@ func (bctx *bctx) initAndTry() (bck *meta.Bck, err error) {
196196
}
197197
if ecode != http.StatusNotFound {
198198
// user GET and PUT requests: making a _silent_ exception for assorted error codes
199-
// (counting them via stats.IncErr though)
199+
// (counting them via stats.Inc though)
200200
if bctx.perms == apc.AceGET || bctx.perms == apc.AcePUT {
201201
if ecode == http.StatusUnauthorized || ecode == http.StatusForbidden {
202202
bctx.p.writeErr(bctx.w, bctx.r, err, ecode, Silent)

ais/prxs3.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -413,9 +413,11 @@ func (p *proxy) lsAllPagesS3(bck *meta.Bck, amsg *apc.ActMsg, lsmsg *apc.LsoMsg,
413413
if err != nil {
414414
return lst, err
415415
}
416-
p.statsT.AddMany(
417-
cos.NamedVal64{Name: stats.ListCount, Value: 1},
418-
cos.NamedVal64{Name: stats.ListLatency, Value: mono.SinceNano(beg)},
416+
417+
vlabs := map[string]string{stats.VarlabBucket: bck.Cname("")}
418+
p.statsT.AddWith(
419+
cos.NamedVal64{Name: stats.ListCount, Value: 1, VarLabs: vlabs},
420+
cos.NamedVal64{Name: stats.ListLatency, Value: mono.SinceNano(beg), VarLabs: vlabs},
419421
)
420422
if pageNum == 1 {
421423
lst = page

0 commit comments

Comments
 (0)
0