github.com/segmentio/kafka-go@v0.4.48-0.20240318174348-3f6244eb34fd/groupbalancer.go

package kafka

import (
	"sort"
)

// GroupMember describes a single participant in a consumer group.
type GroupMember struct {
	// ID is the unique ID for this member as taken from the JoinGroup response.
	ID string

	// Topics is a list of topics that this member is consuming.
	Topics []string

	// UserData contains any information that the GroupBalancer sent to the
	// consumer group coordinator.
	UserData []byte
}

// GroupMemberAssignments holds MemberID => topic => partitions.
type GroupMemberAssignments map[string]map[string][]int

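// exampleGroupMemberAssignments is an illustrative value of this type and is
// not part of the original file; the member IDs and topic name are made up.
// It describes "member-1" consuming partitions 0-2 of "topic-a" while
// "member-2" consumes partitions 3-4.
var exampleGroupMemberAssignments = GroupMemberAssignments{
	"member-1": {"topic-a": {0, 1, 2}},
	"member-2": {"topic-a": {3, 4}},
}
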
// GroupBalancer encapsulates the client side rebalancing logic.
type GroupBalancer interface {
	// ProtocolName of the GroupBalancer
	ProtocolName() string

	// UserData provides the GroupBalancer an opportunity to embed custom
	// UserData into the metadata.
	//
	// Will be used by JoinGroup to begin the consumer group handshake.
	//
	// See https://cwiki.apache.org/confluence/display/KAFKA/A+Guide+To+The+Kafka+Protocol#AGuideToTheKafkaProtocol-JoinGroupRequest
	UserData() ([]byte, error)

	// AssignGroups returns which members will be consuming
	// which topic partitions.
	AssignGroups(members []GroupMember, partitions []Partition) GroupMemberAssignments
}

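// The sketch below is not part of the original file; it only illustrates the
// minimal surface a custom GroupBalancer has to implement. The name
// firstMemberBalancer and its behavior are hypothetical: it hands every
// partition of every topic to the member with the lowest ID, ignoring the
// members' topic subscriptions, and leaves the other members with no entry.
type firstMemberBalancer struct{}

func (firstMemberBalancer) ProtocolName() string { return "first-member" }

func (firstMemberBalancer) UserData() ([]byte, error) { return nil, nil }

func (firstMemberBalancer) AssignGroups(members []GroupMember, partitions []Partition) GroupMemberAssignments {
	assignments := GroupMemberAssignments{}
	if len(members) == 0 {
		return assignments
	}
	// pick the member with the lexicographically smallest ID.
	first := members[0]
	for _, m := range members[1:] {
		if m.ID < first.ID {
			first = m
		}
	}
	// collect every partition of every topic and give all of them to that member.
	topics := map[string][]int{}
	for _, p := range partitions {
		topics[p.Topic] = append(topics[p.Topic], p.ID)
	}
	assignments[first.ID] = topics
	return assignments
}
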
// RangeGroupBalancer assigns each consumer a contiguous range of partitions
// for every topic it subscribes to.
//
// Example: 5 partitions, 2 consumers
// 		C0: [0, 1]
// 		C1: [2, 3, 4]
//
// Example: 6 partitions, 3 consumers
// 		C0: [0, 1]
// 		C1: [2, 3]
// 		C2: [4, 5]
type RangeGroupBalancer struct{}

func (r RangeGroupBalancer) ProtocolName() string {
	return "range"
}

func (r RangeGroupBalancer) UserData() ([]byte, error) {
	return nil, nil
}

func (r RangeGroupBalancer) AssignGroups(members []GroupMember, topicPartitions []Partition) GroupMemberAssignments {
	groupAssignments := GroupMemberAssignments{}
	membersByTopic := findMembersByTopic(members)

	for topic, members := range membersByTopic {
		partitions := findPartitions(topic, topicPartitions)
		partitionCount := len(partitions)
		memberCount := len(members)

		for memberIndex, member := range members {
			assignmentsByTopic, ok := groupAssignments[member.ID]
			if !ok {
				assignmentsByTopic = map[string][]int{}
				groupAssignments[member.ID] = assignmentsByTopic
			}

			minIndex := memberIndex * partitionCount / memberCount
			maxIndex := (memberIndex + 1) * partitionCount / memberCount

			for partitionIndex, partition := range partitions {
				if partitionIndex >= minIndex && partitionIndex < maxIndex {
					assignmentsByTopic[topic] = append(assignmentsByTopic[topic], partition)
				}
			}
		}
	}

	return groupAssignments
}

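// exampleRangeBounds is not part of the original file. It is a small sketch of
// the integer index arithmetic used above, reproducing the "5 partitions,
// 2 consumers" case from the doc comment: member 0 covers partition indexes
// [0, 2) and member 1 covers [2, 5), i.e. C0: [0 1] and C1: [2 3 4].
func exampleRangeBounds() [][2]int {
	partitionCount := 5
	memberCount := 2
	bounds := make([][2]int, 0, memberCount)
	for memberIndex := 0; memberIndex < memberCount; memberIndex++ {
		minIndex := memberIndex * partitionCount / memberCount
		maxIndex := (memberIndex + 1) * partitionCount / memberCount
		bounds = append(bounds, [2]int{minIndex, maxIndex})
	}
	return bounds // [[0 2] [2 5]] for the values above
}
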
// RoundRobinGroupBalancer divides partitions evenly among consumers
//
// Example: 5 partitions, 2 consumers
// 		C0: [0, 2, 4]
// 		C1: [1, 3]
//
// Example: 6 partitions, 3 consumers
// 		C0: [0, 3]
// 		C1: [1, 4]
// 		C2: [2, 5]
type RoundRobinGroupBalancer struct{}

func (r RoundRobinGroupBalancer) ProtocolName() string {
	return "roundrobin"
}

func (r RoundRobinGroupBalancer) UserData() ([]byte, error) {
	return nil, nil
}

func (r RoundRobinGroupBalancer) AssignGroups(members []GroupMember, topicPartitions []Partition) GroupMemberAssignments {
	groupAssignments := GroupMemberAssignments{}
	membersByTopic := findMembersByTopic(members)
	for topic, members := range membersByTopic {
		partitionIDs := findPartitions(topic, topicPartitions)
		memberCount := len(members)

		for memberIndex, member := range members {
			assignmentsByTopic, ok := groupAssignments[member.ID]
			if !ok {
				assignmentsByTopic = map[string][]int{}
				groupAssignments[member.ID] = assignmentsByTopic
			}

			for partitionIndex, partition := range partitionIDs {
				if (partitionIndex % memberCount) == memberIndex {
					assignmentsByTopic[topic] = append(assignmentsByTopic[topic], partition)
				}
			}
		}
	}

	return groupAssignments
}

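// exampleRoundRobinAssignment is not part of the original file. It is a usage
// sketch with two hypothetical members "c0" and "c1" sharing a five partition
// topic "topic-a"; the round robin split described above gives c0: [0 2 4]
// and c1: [1 3].
func exampleRoundRobinAssignment() GroupMemberAssignments {
	members := []GroupMember{
		{ID: "c0", Topics: []string{"topic-a"}},
		{ID: "c1", Topics: []string{"topic-a"}},
	}
	partitions := []Partition{
		{Topic: "topic-a", ID: 0},
		{Topic: "topic-a", ID: 1},
		{Topic: "topic-a", ID: 2},
		{Topic: "topic-a", ID: 3},
		{Topic: "topic-a", ID: 4},
	}
	return RoundRobinGroupBalancer{}.AssignGroups(members, partitions)
}
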
// RackAffinityGroupBalancer makes a best effort to pair up consumers with
// partitions whose leader is in the same rack.  This strategy can have
// performance benefits by minimizing round trip latency between the consumer
// and the broker.  In environments where network traffic across racks incurs
// charges (such as cross-AZ data transfer in AWS), this strategy is also a
// cost optimization measure because it keeps network traffic within the local
// rack where possible.
//
// The primary objective is to spread partitions evenly across consumers with a
// secondary focus on maximizing the number of partitions where the leader and
// the consumer are in the same rack.  For best affinity, it's recommended to
// have a balanced spread of consumers and partition leaders across racks.
//
// This balancer requires Kafka version 0.10.0.0 or later.  Earlier versions do
// not return the brokers' racks in the metadata response.
type RackAffinityGroupBalancer struct {
	// Rack is the name of the rack where this consumer is running.  It will be
	// communicated to the consumer group leader via the UserData so that
	// assignments can be made with affinity to the partition leader.
	Rack string
}

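// exampleRackAffinityReader is not part of the original file. It is a
// configuration sketch, assuming the Reader in this package is configured
// through ReaderConfig.GroupBalancers; the broker address, group ID, topic,
// and rack name are placeholders. A consumer running in rack "us-east-1a"
// advertises that rack so the group leader can favor partitions whose leader
// lives in the same rack.
func exampleRackAffinityReader() *Reader {
	return NewReader(ReaderConfig{
		Brokers: []string{"localhost:9092"},
		GroupID: "example-group",
		Topic:   "topic-a",
		GroupBalancers: []GroupBalancer{
			RackAffinityGroupBalancer{Rack: "us-east-1a"},
		},
	})
}
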
func (r RackAffinityGroupBalancer) ProtocolName() string {
	return "rack-affinity"
}

func (r RackAffinityGroupBalancer) AssignGroups(members []GroupMember, partitions []Partition) GroupMemberAssignments {
	membersByTopic := make(map[string][]GroupMember)
	for _, m := range members {
		for _, t := range m.Topics {
			membersByTopic[t] = append(membersByTopic[t], m)
		}
	}

	partitionsByTopic := make(map[string][]Partition)
	for _, p := range partitions {
		partitionsByTopic[p.Topic] = append(partitionsByTopic[p.Topic], p)
	}

	assignments := GroupMemberAssignments{}
	for topic := range membersByTopic {
		topicAssignments := r.assignTopic(membersByTopic[topic], partitionsByTopic[topic])
		for member, parts := range topicAssignments {
			memberAssignments, ok := assignments[member]
			if !ok {
				memberAssignments = make(map[string][]int)
				assignments[member] = memberAssignments
			}
			memberAssignments[topic] = parts
		}
	}
	return assignments
}

func (r RackAffinityGroupBalancer) UserData() ([]byte, error) {
	return []byte(r.Rack), nil
}

func (r *RackAffinityGroupBalancer) assignTopic(members []GroupMember, partitions []Partition) map[string][]int {
	zonedPartitions := make(map[string][]int)
	for _, part := range partitions {
		zone := part.Leader.Rack
		zonedPartitions[zone] = append(zonedPartitions[zone], part.ID)
	}

	zonedConsumers := make(map[string][]string)
	for _, member := range members {
		zone := string(member.UserData)
		zonedConsumers[zone] = append(zonedConsumers[zone], member.ID)
	}

	targetPerMember := len(partitions) / len(members)
	remainder := len(partitions) % len(members)
	assignments := make(map[string][]int)

	// assign as many as possible in zone.  this will assign up to partsPerMember
	// to each consumer.  it will also prefer to allocate remainder partitions
	// in zone if possible.
	for zone, parts := range zonedPartitions {
		consumers := zonedConsumers[zone]
		if len(consumers) == 0 {
			continue
		}

		// don't over-allocate.  cap partition assignments at the calculated
		// target.
		partsPerMember := len(parts) / len(consumers)
		if partsPerMember > targetPerMember {
			partsPerMember = targetPerMember
		}

		for _, consumer := range consumers {
			assignments[consumer] = append(assignments[consumer], parts[:partsPerMember]...)
			parts = parts[partsPerMember:]
		}

		// if we had enough partitions for each consumer in this zone to hit its
		// target, attempt to use any leftover partitions to satisfy the total
		// remainder by adding at most 1 partition per consumer.
		leftover := len(parts)
		if partsPerMember == targetPerMember {
			if leftover > remainder {
				leftover = remainder
			}
			if leftover > len(consumers) {
				leftover = len(consumers)
			}
			remainder -= leftover
		}

		// this loop covers the case where we're assigning extra partitions or
		// if there weren't enough to satisfy the targetPerMember and the zoned
		// partitions didn't divide evenly.
		for i := 0; i < leftover; i++ {
			assignments[consumers[i]] = append(assignments[consumers[i]], parts[i])
		}
		parts = parts[leftover:]

		if len(parts) == 0 {
			delete(zonedPartitions, zone)
		} else {
			zonedPartitions[zone] = parts
		}
	}

	// assign out remainders regardless of zone.
	var remaining []int
	for _, partitions := range zonedPartitions {
		remaining = append(remaining, partitions...)
	}

	for _, member := range members {
		assigned := assignments[member.ID]
		delta := targetPerMember - len(assigned)
		// if it were possible to assign the remainder in zone, it's been taken
		// care of already.  now we will portion out any remainder to a member
		// that can take it.
		if delta >= 0 && remainder > 0 {
			delta++
			remainder--
		}
		if delta > 0 {
			assignments[member.ID] = append(assigned, remaining[:delta]...)
			remaining = remaining[delta:]
		}
	}

	return assignments
}

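// exampleRackAffinityAssignment is not part of the original file. It is a
// behavioral sketch of the rack affinity strategy with made-up member IDs,
// rack names, and topic: "c0" reports rack-1 through its UserData and "c1"
// reports rack-2, so each member receives the two partitions whose leader
// sits in its own rack, c0: [0 1] and c1: [2 3].
func exampleRackAffinityAssignment() GroupMemberAssignments {
	members := []GroupMember{
		{ID: "c0", Topics: []string{"topic-a"}, UserData: []byte("rack-1")},
		{ID: "c1", Topics: []string{"topic-a"}, UserData: []byte("rack-2")},
	}
	partitions := []Partition{
		{Topic: "topic-a", ID: 0, Leader: Broker{Rack: "rack-1"}},
		{Topic: "topic-a", ID: 1, Leader: Broker{Rack: "rack-1"}},
		{Topic: "topic-a", ID: 2, Leader: Broker{Rack: "rack-2"}},
		{Topic: "topic-a", ID: 3, Leader: Broker{Rack: "rack-2"}},
	}
	return RackAffinityGroupBalancer{}.AssignGroups(members, partitions)
}
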
// findPartitions extracts the partition ids associated with the topic from the
// list of Partitions provided.
func findPartitions(topic string, partitions []Partition) []int {
	var ids []int
	for _, partition := range partitions {
		if partition.Topic == topic {
			ids = append(ids, partition.ID)
		}
	}
	return ids
}

// findMembersByTopic groups the members by topic.
func findMembersByTopic(members []GroupMember) map[string][]GroupMember {
	membersByTopic := map[string][]GroupMember{}
	for _, member := range members {
		for _, topic := range member.Topics {
			membersByTopic[topic] = append(membersByTopic[topic], member)
		}
	}

	// normalize ordering of members to enable grouping across topics by partitions
	//
	// Want:
	// 		C0 [T0/P0, T1/P0]
	// 		C1 [T0/P1, T1/P1]
	//
	// Not:
	// 		C0 [T0/P0, T1/P1]
	// 		C1 [T0/P1, T1/P0]
	//
	// Even though the latter is still round robin, the partitions are crossed
	//
	for _, members := range membersByTopic {
		sort.Slice(members, func(i, j int) bool {
			return members[i].ID < members[j].ID
		})
	}

	return membersByTopic
}

// findGroupBalancer returns the GroupBalancer with the specified protocolName
// from the slice provided.
func findGroupBalancer(protocolName string, balancers []GroupBalancer) (GroupBalancer, bool) {
	for _, balancer := range balancers {
		if balancer.ProtocolName() == protocolName {
			return balancer, true
		}
	}
	return nil, false
}