github.com/netdata/go.d.plugin@v0.58.1/modules/hdfs/metrics.go (about)

     1  // SPDX-License-Identifier: GPL-3.0-or-later
     2  
     3  package hdfs
     4  
     5  // HDFS Architecture
     6  // https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html#NameNode+and+DataNodes
     7  
     8  // Metrics description
     9  // https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html
    10  
    11  // Good article
    12  // https://www.datadoghq.com/blog/monitor-hadoop-metrics/#hdfs-metrics
    13  
    14  type metrics struct {
    15  	Jvm              *jvmMetrics              `stm:"jvm"`  // both
    16  	Rpc              *rpcActivityMetrics      `stm:"rpc"`  // both
    17  	FSNameSystem     *fsNameSystemMetrics     `stm:"fsns"` // namenode
    18  	FSDatasetState   *fsDatasetStateMetrics   `stm:"fsds"` // datanode
    19  	DataNodeActivity *dataNodeActivityMetrics `stm:"dna"`  // datanode
    20  }
    21  
    22  type jvmMetrics struct {
    23  	ProcessName string `json:"tag.ProcessName"`
    24  	HostName    string `json:"tag.Hostname"`
    25  	//MemNonHeapUsedM            float64 `stm:"mem_non_heap_used,1000,1"`
    26  	//MemNonHeapCommittedM       float64 `stm:"mem_non_heap_committed,1000,1"`
    27  	//MemNonHeapMaxM             float64 `stm:"mem_non_heap_max"`
    28  	MemHeapUsedM      float64 `stm:"mem_heap_used,1000,1"`
    29  	MemHeapCommittedM float64 `stm:"mem_heap_committed,1000,1"`
    30  	MemHeapMaxM       float64 `stm:"mem_heap_max"`
    31  	//MemMaxM                    float64 `stm:"mem_max"`
    32  	GcCount                    float64 `stm:"gc_count"`
    33  	GcTimeMillis               float64 `stm:"gc_time_millis"`
    34  	GcNumWarnThresholdExceeded float64 `stm:"gc_num_warn_threshold_exceeded"`
    35  	GcNumInfoThresholdExceeded float64 `stm:"gc_num_info_threshold_exceeded"`
    36  	GcTotalExtraSleepTime      float64 `stm:"gc_total_extra_sleep_time"`
    37  	ThreadsNew                 float64 `stm:"threads_new"`
    38  	ThreadsRunnable            float64 `stm:"threads_runnable"`
    39  	ThreadsBlocked             float64 `stm:"threads_blocked"`
    40  	ThreadsWaiting             float64 `stm:"threads_waiting"`
    41  	ThreadsTimedWaiting        float64 `stm:"threads_timed_waiting"`
    42  	ThreadsTerminated          float64 `stm:"threads_terminated"`
    43  	LogFatal                   float64 `stm:"log_fatal"`
    44  	LogError                   float64 `stm:"log_error"`
    45  	LogWarn                    float64 `stm:"log_warn"`
    46  	LogInfo                    float64 `stm:"log_info"`
    47  }
    48  
    49  type rpcActivityMetrics struct {
    50  	ReceivedBytes       float64 `stm:"received_bytes"`
    51  	SentBytes           float64 `stm:"sent_bytes"`
    52  	RpcQueueTimeNumOps  float64 `stm:"queue_time_num_ops"`
    53  	RpcQueueTimeAvgTime float64 `stm:"queue_time_avg_time,1000,1"`
    54  	//RpcProcessingTimeNumOps  float64
    55  	RpcProcessingTimeAvgTime float64 `stm:"processing_time_avg_time,1000,1"`
    56  	//DeferredRpcProcessingTimeNumOps  float64
    57  	//DeferredRpcProcessingTimeAvgTime float64
    58  	//RpcAuthenticationFailures        float64
    59  	//RpcAuthenticationSuccesses       float64
    60  	//RpcAuthorizationFailures         float64
    61  	//RpcAuthorizationSuccesses        float64
    62  	//RpcClientBackoff                 float64
    63  	//RpcSlowCalls                     float64
    64  	NumOpenConnections float64 `stm:"num_open_connections"`
    65  	CallQueueLength    float64 `stm:"call_queue_length"`
    66  	//NumDroppedConnections            float64
    67  }
    68  
    69  type fsNameSystemMetrics struct {
    70  	HostName string `json:"tag.Hostname"`
    71  	HAState  string `json:"tag.HAState"`
    72  	//TotalSyncTimes                               float64 `json:"tag.tag.TotalSyncTimes"`
    73  	MissingBlocks float64 `stm:"missing_blocks"`
    74  	//MissingReplOneBlocks                         float64 `stm:"missing_repl_one_blocks"`
    75  	//ExpiredHeartbeats                            float64 `stm:"expired_heartbeats"`
    76  	//TransactionsSinceLastCheckpoint              float64 `stm:"transactions_since_last_checkpoint"`
    77  	//TransactionsSinceLastLogRoll                 float64 `stm:"transactions_since_last_log_roll"`
    78  	//LastWrittenTransactionId                     float64 `stm:"last_written_transaction_id"`
    79  	//LastCheckpointTime                           float64 `stm:"last_checkpoint_time"`
    80  	CapacityTotal float64 `stm:"capacity_total"`
    81  	//CapacityTotalGB                              float64 `stm:"capacity_total_gb"`
    82  	CapacityDfsUsed float64 `json:"CapacityUsed" stm:"capacity_used_dfs"`
    83  	//CapacityUsedGB                               float64 `stm:"capacity_used_gb"`
    84  	CapacityRemaining float64 `stm:"capacity_remaining"`
    85  	//ProvidedCapacityTotal                        float64 `stm:"provided_capacity_total"`
    86  	//CapacityRemainingGB                          float64 `stm:"capacity_remaining_gb"`
    87  	CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`
    88  	TotalLoad          float64 `stm:"total_load"`
    89  	//SnapshottableDirectories                     float64 `stm:"snapshottable_directories"`
    90  	//Snapshots                                    float64 `stm:"snapshots"`
    91  	//NumEncryptionZones                           float64 `stm:"num_encryption_zones"`
    92  	//LockQueueLength                              float64 `stm:"lock_queue_length"`
    93  	BlocksTotal float64 `stm:"blocks_total"`
    94  	//NumFilesUnderConstruction                    float64 `stm:"num_files_under_construction"`
    95  	//NumActiveClients                             float64 `stm:"num_active_clients"`
    96  	FilesTotal float64 `stm:"files_total"`
    97  	//PendingReplicationBlocks    float64 `stm:"pending_replication_blocks"`
    98  	//PendingReconstructionBlocks float64 `stm:"pending_reconstruction_blocks"`
    99  	UnderReplicatedBlocks float64 `stm:"under_replicated_blocks"`
   100  	//LowRedundancyBlocks                          float64 `stm:"low_redundancy_blocks"`
   101  	CorruptBlocks float64 `stm:"corrupt_blocks"`
   102  	//ScheduledReplicationBlocks float64 `stm:"scheduled_replication_blocks"`
   103  	//PendingDeletionBlocks      float64 `stm:"pending_deletion_blocks"`
   104  	//LowRedundancyReplicatedBlocks                float64 `stm:"low_redundancy_replicated_blocks"`
   105  	//CorruptReplicatedBlocks                      float64 `stm:"corrupt_replicated_blocks"`
   106  	//MissingReplicatedBlocks                      float64 `stm:"missing_replicated_blocks"`
   107  	//MissingReplicationOneBlocks                  float64 `stm:"missing_replication_one_blocks"`
   108  	//HighestPriorityLowRedundancyReplicatedBlocks float64 `stm:"highest_priority_low_redundancy_replicated_blocks"`
   109  	//HighestPriorityLowRedundancyECBlocks         float64 `stm:"highest_priority_low_redundancy_ec_blocks"`
   110  	//BytesInFutureReplicatedBlocks                float64 `stm:"bytes_in_future_replicated_blocks"`
   111  	//PendingDeletionReplicatedBlocks              float64 `stm:"pending_deletion_replicated_blocks"`
   112  	//TotalReplicatedBlocks                        float64 `stm:"total_replicated_blocks"`
   113  	//LowRedundancyECBlockGroups                   float64 `stm:"low_redundancy_ec_block_groups"`
   114  	//CorruptECBlockGroups                         float64 `stm:"corrupt_ec_block_groups"`
   115  	//MissingECBlockGroups                         float64 `stm:"missing_ec_block_groups"`
   116  	//BytesInFutureECBlockGroups                   float64 `stm:"bytes_in_future_ec_block_groups"`
   117  	//PendingDeletionECBlocks                      float64 `stm:"pending_deletion_ec_blocks"`
   118  	//TotalECBlockGroups                           float64 `stm:"total_ec_block_groups"`
   119  	//ExcessBlocks                                 float64 `stm:"excess_blocks"`
   120  	//NumTimedOutPendingReconstructions            float64 `stm:"num_timed_out_pending_reconstructions"`
   121  	//PostponedMisreplicatedBlocks                 float64 `stm:"postponed_misreplicated_blocks"`
   122  	//PendingDataNodeMessageCount                  float64 `stm:"pending_data_node_message_count"`
   123  	//MillisSinceLastLoadedEdits                   float64 `stm:"millis_since_last_loaded_edits"`
   124  	//BlockCapacity                                float64 `stm:"block_capacity"`
   125  	NumLiveDataNodes float64 `stm:"num_live_data_nodes"`
   126  	NumDeadDataNodes float64 `stm:"num_dead_data_nodes"`
   127  	//NumDecomLiveDataNodes                        float64 `stm:"num_decom_live_data_nodes"`
   128  	//NumDecomDeadDataNodes                        float64 `stm:"num_decom_dead_data_nodes"`
   129  	VolumeFailuresTotal float64 `stm:"volume_failures_total"`
   130  	//EstimatedCapacityLostTotal                   float64 `stm:"estimated_capacity_lost_total"`
   131  	//NumDecommissioningDataNodes                  float64 `stm:"num_decommissioning_data_nodes"`
   132  	StaleDataNodes float64 `stm:"stale_data_nodes"`
   133  	//NumStaleStorages                             float64 `stm:"num_stale_storages"`
   134  	//TotalSyncCount                               float64 `stm:"total_sync_count"`
   135  	//NumInMaintenanceLiveDataNodes                float64 `stm:"num_in_maintenance_live_data_nodes"`
   136  	//NumInMaintenanceDeadDataNodes                float64 `stm:"num_in_maintenance_dead_data_nodes"`
   137  	//NumEnteringMaintenanceDataNodes              float64 `stm:"num_entering_maintenance_data_nodes"`
   138  
   139  	// custom attributes
   140  	CapacityUsed float64 `json:"-" stm:"capacity_used"`
   141  }
   142  
   143  type fsDatasetStateMetrics struct {
   144  	HostName         string  `json:"tag.Hostname"`
   145  	Capacity         float64 `stm:"capacity_total"`
   146  	DfsUsed          float64 `stm:"capacity_used_dfs"`
   147  	Remaining        float64 `stm:"capacity_remaining"`
   148  	NumFailedVolumes float64 `stm:"num_failed_volumes"`
   149  	//LastVolumeFailureDate      float64 `stm:"LastVolumeFailureDate"`
   150  	//EstimatedCapacityLostTotal float64 `stm:"EstimatedCapacityLostTotal"`
   151  	//CacheUsed                  float64 `stm:"CacheUsed"`
   152  	//CacheCapacity              float64 `stm:"CacheCapacity"`
   153  	//NumBlocksCached            float64 `stm:"NumBlocksCached"`
   154  	//NumBlocksFailedToCache     float64 `stm:"NumBlocksFailedToCache"`
   155  	//NumBlocksFailedToUnCache   float64 `stm:"NumBlocksFailedToUnCache"`
   156  
   157  	// custom attributes
   158  	CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`
   159  	CapacityUsed       float64 `stm:"capacity_used"`
   160  }
   161  
   162  type dataNodeActivityMetrics struct {
   163  	HostName     string  `json:"tag.Hostname"`
   164  	BytesWritten float64 `stm:"bytes_written"`
   165  	//TotalWriteTime                             float64
   166  	BytesRead float64 `stm:"bytes_read"`
   167  	//TotalReadTime                              float64
   168  	//BlocksWritten float64
   169  	//BlocksRead    float64
   170  	//BlocksReplicated                           float64
   171  	//BlocksRemoved                              float64
   172  	//BlocksVerified                             float64
   173  	//BlockVerificationFailures                  float64
   174  	//BlocksCached                               float64
   175  	//BlocksUncached                             float64
   176  	//ReadsFromLocalClient                       float64
   177  	//ReadsFromRemoteClient                      float64
   178  	//WritesFromLocalClient                      float64
   179  	//WritesFromRemoteClient                     float64
   180  	//BlocksGetLocalPathInfo                     float64
   181  	//RemoteBytesRead                            float64
   182  	//RemoteBytesWritten                         float64
   183  	//RamDiskBlocksWrite                         float64
   184  	//RamDiskBlocksWriteFallback                 float64
   185  	//RamDiskBytesWrite                          float64
   186  	//RamDiskBlocksReadHits                      float64
   187  	//RamDiskBlocksEvicted                       float64
   188  	//RamDiskBlocksEvictedWithoutRead            float64
   189  	//RamDiskBlocksEvictionWindowMsNumOps        float64
   190  	//RamDiskBlocksEvictionWindowMsAvgTime       float64
   191  	//RamDiskBlocksLazyPersisted                 float64
   192  	//RamDiskBlocksDeletedBeforeLazyPersisted    float64
   193  	//RamDiskBytesLazyPersisted                  float64
   194  	//RamDiskBlocksLazyPersistWindowMsNumOps     float64
   195  	//RamDiskBlocksLazyPersistWindowMsAvgTime    float64
   196  	//FsyncCount                                 float64
   197  	//VolumeFailures                             float64
   198  	//DatanodeNetworkErrors                      float64
   199  	//DataNodeActiveXceiversCount                float64
   200  	//ReadBlockOpNumOps                          float64
   201  	//ReadBlockOpAvgTime                         float64
   202  	//WriteBlockOpNumOps                         float64
   203  	//WriteBlockOpAvgTime                        float64
   204  	//BlockChecksumOpNumOps                      float64
   205  	//BlockChecksumOpAvgTime                     float64
   206  	//CopyBlockOpNumOps                          float64
   207  	//CopyBlockOpAvgTime                         float64
   208  	//ReplaceBlockOpNumOps                       float64
   209  	//ReplaceBlockOpAvgTime                      float64
   210  	//HeartbeatsNumOps                           float64
   211  	//HeartbeatsAvgTime                          float64
   212  	//HeartbeatsTotalNumOps                      float64
   213  	//HeartbeatsTotalAvgTime                     float64
   214  	//LifelinesNumOps                            float64
   215  	//LifelinesAvgTime                           float64
   216  	//BlockReportsNumOps                         float64
   217  	//BlockReportsAvgTime                        float64
   218  	//IncrementalBlockReportsNumOps              float64
   219  	//IncrementalBlockReportsAvgTime             float64
   220  	//CacheReportsNumOps                         float64
   221  	//CacheReportsAvgTime                        float64
   222  	//PacketAckRoundTripTimeNanosNumOps          float64
   223  	//PacketAckRoundTripTimeNanosAvgTime         float64
   224  	//FlushNanosNumOps                           float64
   225  	//FlushNanosAvgTime                          float64
   226  	//FsyncNanosNumOps                           float64
   227  	//FsyncNanosAvgTime                          float64
   228  	//SendDataPacketBlockedOnNetworkNanosNumOps  float64
   229  	//SendDataPacketBlockedOnNetworkNanosAvgTime float64
   230  	//SendDataPacketTransferNanosNumOps          float64
   231  	//SendDataPacketTransferNanosAvgTime         float64
   232  	//BlocksInPendingIBR                         float64
   233  	//BlocksReceivingInPendingIBR                float64
   234  	//BlocksReceivedInPendingIBR                 float64
   235  	//BlocksDeletedInPendingIBR                  float64
   236  	//EcReconstructionTasks                      float64
   237  	//EcFailedReconstructionTasks                float64
   238  	//EcDecodingTimeNanos                        float64
   239  	//EcReconstructionBytesRead                  float64
   240  	//EcReconstructionBytesWritten               float64
   241  	//EcReconstructionRemoteBytesRead            float64
   242  	//EcReconstructionReadTimeMillis             float64
   243  	//EcReconstructionDecodingTimeMillis         float64
   244  	//EcReconstructionWriteTimeMillis            float64
   245  }