From f0e6574d5ed03446b9b221653b20618c0e11b381 Mon Sep 17 00:00:00 2001
From: Chris Lu
Date: Sat, 25 May 2019 02:02:44 -0700
Subject: [PATCH] allocate ec shards to volume servers

---
 weed/shell/command_ec_encode.go               | 186 +++++++++++++++++-
 weed/storage/erasure_coding/ec_encoder.go     |   7 +-
 weed/storage/erasure_coding/ec_test.go        |   4 +-
 weed/storage/erasure_coding/ec_volume_info.go |  15 +-
 weed/topology/data_node_ec.go                 |   4 +-
 weed/topology/topology_ec.go                  |   4 +-
 6 files changed, 206 insertions(+), 14 deletions(-)

diff --git a/weed/shell/command_ec_encode.go b/weed/shell/command_ec_encode.go
index 80a0ccf5c..4647c2507 100644
--- a/weed/shell/command_ec_encode.go
+++ b/weed/shell/command_ec_encode.go
@@ -5,10 +5,15 @@ import (
 	"flag"
 	"fmt"
 	"io"
+	"sort"
+	"sync"
 
 	"github.com/chrislusf/seaweedfs/weed/operation"
+	"github.com/chrislusf/seaweedfs/weed/pb/master_pb"
 	"github.com/chrislusf/seaweedfs/weed/pb/volume_server_pb"
+	"github.com/chrislusf/seaweedfs/weed/storage/erasure_coding"
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
+	"github.com/chrislusf/seaweedfs/weed/wdclient"
 	"google.golang.org/grpc"
 )
 
@@ -53,18 +58,28 @@ func (c *commandEcEncode) Do(args []string, commandEnv *commandEnv, writer io.Wr
 
 	ctx := context.Background()
 
+	// find volume location
 	locations := commandEnv.masterClient.GetLocations(uint32(*volumeId))
-
 	if len(locations) == 0 {
 		return fmt.Errorf("volume %d not found", *volumeId)
 	}
 
-	err = generateEcSlices(ctx, commandEnv.option.GrpcDialOption, needle.VolumeId(*volumeId), locations[0].Url)
+	// generate ec shards
+	err = generateEcShards(ctx, commandEnv.option.GrpcDialOption, needle.VolumeId(*volumeId), locations[0].Url)
+	if err != nil {
+		return fmt.Errorf("generate ec shards for volume %d on %s: %v", *volumeId, locations[0].Url, err)
+	}
+
+	// balance the ec shards to current cluster
+	err = balanceEcShards(ctx, commandEnv, needle.VolumeId(*volumeId), locations[0])
+	if err != nil {
+		return fmt.Errorf("balance ec shards for volume %d on %s: %v", *volumeId, locations[0].Url, err)
+	}
 
 	return err
 }
 
-func generateEcSlices(ctx context.Context, grpcDialOption grpc.DialOption, volumeId needle.VolumeId, sourceVolumeServer string) error {
+func generateEcShards(ctx context.Context, grpcDialOption grpc.DialOption, volumeId needle.VolumeId, sourceVolumeServer string) error {
 
 	err := operation.WithVolumeServerClient(sourceVolumeServer, grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
 		_, genErr := volumeServerClient.VolumeEcGenerateSlices(ctx, &volume_server_pb.VolumeEcGenerateSlicesRequest{
@@ -76,3 +91,168 @@ func generateEcSlices(ctx context.Context, grpcDialOption grpc.DialOption, volum
 	return err
 
 }
+
+// balanceEcShards picks the volume servers with the most free ec shard slots
+// and asks them to copy the ec shards of volumeId from existingLocation.
+func balanceEcShards(ctx context.Context, commandEnv *commandEnv, volumeId needle.VolumeId, existingLocation wdclient.Location) (err error) {
+
+	// list all possible locations
+	var resp *master_pb.VolumeListResponse
+	err = commandEnv.masterClient.WithClient(ctx, func(client master_pb.SeaweedClient) error {
+		resp, err = client.VolumeList(ctx, &master_pb.VolumeListRequest{})
+		return err
+	})
+	if err != nil {
+		return err
+	}
+
+	// find out all volume servers with at least one free ec shard slot
+	var allDataNodes []*master_pb.DataNodeInfo
+	var totalFreeEcSlots int
+	eachDataNode(resp.TopologyInfo, func(dn *master_pb.DataNodeInfo) {
+		if freeEcSlots := countFreeShardSlots(dn); freeEcSlots > 0 {
+			allDataNodes = append(allDataNodes, dn)
+			totalFreeEcSlots += freeEcSlots
+		}
+	})
+	if totalFreeEcSlots < erasure_coding.TotalShardsCount {
+		return fmt.Errorf("not enough free ec shard slots. only %d left", totalFreeEcSlots)
+	}
+	// keep the servers with the most free slots first
+	sort.Slice(allDataNodes, func(i, j int) bool {
+		return countFreeShardSlots(allDataNodes[j]) < countFreeShardSlots(allDataNodes[i])
+	})
+	if len(allDataNodes) > erasure_coding.TotalShardsCount {
+		allDataNodes = allDataNodes[:erasure_coding.TotalShardsCount]
+	}
+
+	// calculate how many shards to allocate for these servers
+	allocated := balancedEcDistribution(allDataNodes)
+
+	// ask the data nodes to copy from the source volume server
+	err = parallelCopyEcShardsFromSource(ctx, commandEnv.option.GrpcDialOption, allDataNodes, allocated, volumeId, existingLocation)
+	if err != nil {
+		return err
+	}
+
+	// ask the source volume server to clean up copied ec shards
+
+	// ask the source volume server to delete the original volume
+
+	return err
+
+}
+
+// parallelCopyEcShardsFromSource starts one goroutine per target server that
+// was allocated at least one shard, gives each a contiguous range of shard
+// ids, waits for all of them, and returns the first error in server order.
+func parallelCopyEcShardsFromSource(ctx context.Context, grpcDialOption grpc.DialOption,
+	targetServers []*master_pb.DataNodeInfo, allocated []int,
+	volumeId needle.VolumeId, existingLocation wdclient.Location) (err error) {
+
+	// each goroutine writes only its own slot, so the results need no lock
+	copyErrs := make([]error, len(targetServers))
+	var wg sync.WaitGroup
+	startFromShardId := 0
+	for i, server := range targetServers {
+		if allocated[i] <= 0 {
+			continue
+		}
+
+		wg.Add(1)
+		go func(i int, server *master_pb.DataNodeInfo, startFromShardId int, shardCount int) {
+			defer wg.Done()
+			copyErrs[i] = oneServerCopyEcShardsFromSource(ctx, grpcDialOption, server, startFromShardId, shardCount, volumeId, existingLocation)
+		}(i, server, startFromShardId, allocated[i])
+		startFromShardId += allocated[i]
+	}
+	wg.Wait()
+
+	for _, copyErr := range copyErrs {
+		if copyErr != nil {
+			return copyErr
+		}
+	}
+	return nil
+}
+
+// oneServerCopyEcShardsFromSource handles the shard ids in
+// [startFromShardId, startFromShardId+shardCount) for targetServer. It returns
+// nil right away when targetServer.Id equals existingLocation.Url; otherwise
+// it only prints one line per planned copy for now and returns nil.
+func oneServerCopyEcShardsFromSource(ctx context.Context, grpcDialOption grpc.DialOption,
+	targetServer *master_pb.DataNodeInfo, startFromShardId int, shardCount int,
+	volumeId needle.VolumeId, existingLocation wdclient.Location) (err error) {
+
+	if targetServer.Id == existingLocation.Url {
+		return nil
+	}
+
+	for shardId := startFromShardId; shardId < startFromShardId+shardCount; shardId++ {
+		fmt.Printf("copy %d.%d %s => %s\n", volumeId, shardId, existingLocation.Url, targetServer.Id)
+	}
+
+	return nil
+}
+
+// balancedEcDistribution hands out erasure_coding.TotalShardsCount shards
+// round-robin, one per server per pass, never giving a server more shards
+// than it has free slots. allocated[i] is the share of servers[i]. If the
+// servers cannot hold all the shards, it allocates what fits and stops.
+func balancedEcDistribution(servers []*master_pb.DataNodeInfo) (allocated []int) {
+	freeSlots := make([]int, len(servers))
+	allocated = make([]int, len(servers))
+	for i, server := range servers {
+		freeSlots[i] = countFreeShardSlots(server)
+	}
+	allocatedCount := 0
+	for allocatedCount < erasure_coding.TotalShardsCount {
+		progressed := false
+		for i := range servers {
+			if freeSlots[i]-allocated[i] > 0 {
+				allocated[i]++
+				allocatedCount++
+				progressed = true
+			}
+			if allocatedCount >= erasure_coding.TotalShardsCount {
+				break
+			}
+		}
+		if !progressed {
+			// every server is full: stop instead of looping forever
+			break
+		}
+	}
+
+	return allocated
+}
+
+// eachDataNode calls fn for every data node in every rack of every data
+// center of topo.
+func eachDataNode(topo *master_pb.TopologyInfo, fn func(*master_pb.DataNodeInfo)) {
+	for _, dc := range topo.DataCenterInfos {
+		for _, rack := range dc.RackInfos {
+			for _, dn := range rack.DataNodeInfos {
+				fn(dn)
+			}
+		}
+	}
+}
+
+// countShards sums the number of shard ids set in EcIndexBits over all
+// ecShardInfos.
+func countShards(ecShardInfos []*master_pb.VolumeEcShardInformationMessage) (count int) {
+	for _, ecShardInfo := range ecShardInfos {
+		shardBits := erasure_coding.ShardBits(ecShardInfo.EcIndexBits)
+		count += shardBits.ShardIdCount()
+	}
+	return
+}
+
+// countFreeShardSlots returns how many more ec shards dn can take: every
+// free volume slot counts as room for 10 shards, minus the shards dn
+// already reports. NOTE(review): 10 presumably mirrors
+// erasure_coding.DataShardsCount; confirm before replacing the literal.
+func countFreeShardSlots(dn *master_pb.DataNodeInfo) (count int) {
+	return int(dn.FreeVolumeCount)*10 - countShards(dn.EcShardInfos)
+}
diff --git a/weed/storage/erasure_coding/ec_encoder.go b/weed/storage/erasure_coding/ec_encoder.go
index dbfe5858b..da0cfcde8 100644
--- a/weed/storage/erasure_coding/ec_encoder.go
+++ b/weed/storage/erasure_coding/ec_encoder.go
@@ -15,6 +15,7 @@ import (
 const (
 	DataShardsCount             = 10
 	ParityShardsCount           = 4
+	TotalShardsCount            = DataShardsCount + ParityShardsCount
 	ErasureCodingLargeBlockSize = 1024 * 1024 * 1024 // 1GB
 	ErasureCodingSmallBlockSize = 1024 * 1024        // 1MB
 )
@@ -93,7 +94,7 @@ func encodeData(file *os.File, enc reedsolomon.Encoder, startOffset, blockSize i
 }
 
 func openEcFiles(baseFileName string, forRead bool) (files []*os.File, err error) {
-	for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+	for i := 0; i < TotalShardsCount; i++ {
 		fname := baseFileName + ToExt(i)
 		openOption := os.O_TRUNC | os.O_CREATE | os.O_WRONLY
 		if forRead {
@@ -138,7 +139,7 @@ func encodeDataOneBatch(file *os.File, enc reedsolomon.Encoder, startOffset, blo
 		return err
 	}
 
-	for i := 0; i < DataShardsCount+ParityShardsCount; i++ {
+	for i := 0; i < TotalShardsCount; i++ {
 		_, err := outputs[i].Write(buffers[i])
 		if err != nil {
 			return err
@@ -154,7 +155,7 @@ func encodeDatFile(remainingSize int64, err error, baseFileName string, bufferSi
 	if err != nil {
 		return fmt.Errorf("failed to create encoder: %v", err)
 	}
-	buffers := make([][]byte, DataShardsCount+ParityShardsCount)
+	buffers := make([][]byte, TotalShardsCount)
 	outputs, err := openEcFiles(baseFileName, false)
 	defer closeEcFiles(outputs)
 	if err != nil {
diff --git a/weed/storage/erasure_coding/ec_test.go b/weed/storage/erasure_coding/ec_test.go
index 625f4e9a6..ecf73ac96 100644
--- a/weed/storage/erasure_coding/ec_test.go
+++ b/weed/storage/erasure_coding/ec_test.go
@@ -153,9 +153,9 @@ func readFromOtherEcFiles(ecFiles []*os.File, ecFileIndex int, ecFileOffset int6
 		return nil, fmt.Errorf("failed to create encoder: %v", err)
 	}
 
-	bufs := make([][]byte, DataShardsCount+ParityShardsCount)
+	bufs := make([][]byte, TotalShardsCount)
 	for i := 0; i < DataShardsCount; {
-		n := int(rand.Int31n(DataShardsCount + ParityShardsCount))
+		n := int(rand.Int31n(TotalShardsCount))
 		if n == ecFileIndex || bufs[n] != nil {
 			continue
 		}
diff --git a/weed/storage/erasure_coding/ec_volume_info.go b/weed/storage/erasure_coding/ec_volume_info.go
index c26269158..ef8cc4ed4 100644
--- a/weed/storage/erasure_coding/ec_volume_info.go
+++ b/weed/storage/erasure_coding/ec_volume_info.go
@@ -36,6 +36,11 @@ func (ecInfo *EcVolumeInfo) ShardIds() (ret []ShardId) {
 	return ecInfo.ShardBits.ShardIds()
 }
 
+// ShardIdCount returns the number of shard ids set in ecInfo.ShardBits.
+func (ecInfo *EcVolumeInfo) ShardIdCount() (count int) {
+	return ecInfo.ShardBits.ShardIdCount()
+}
+
 func (ecInfo *EcVolumeInfo) Minus(other *EcVolumeInfo) (*EcVolumeInfo) {
 	ret := &EcVolumeInfo{
 		VolumeId: ecInfo.VolumeId,
@@ -69,7 +74,7 @@ func (b ShardBits) HasShardId(id ShardId) bool {
 }
 
 func (b ShardBits) ShardIds() (ret []ShardId) {
-	for i := ShardId(0); i < DataShardsCount+ParityShardsCount; i++ {
+	for i := ShardId(0); i < TotalShardsCount; i++ {
 		if b.HasShardId(i) {
 			ret = append(ret, i)
 		}
@@ -77,6 +82,14 @@ func (b ShardBits) ShardIds() (ret []ShardId) {
 	return
 }
 
+// ShardIdCount returns the number of bits set in b, one per shard id it holds.
+func (b ShardBits) ShardIdCount() (count int) {
+	for count = 0; b > 0; count++ {
+		b &= b - 1 // clears the lowest set bit
+	}
+	return
+}
+
 func (b ShardBits) Minus(other ShardBits) (ShardBits) {
 	return b &^ other
 }
diff --git a/weed/topology/data_node_ec.go b/weed/topology/data_node_ec.go
index e8ead5511..63c8f2127 100644
--- a/weed/topology/data_node_ec.go
+++ b/weed/topology/data_node_ec.go
@@ -30,11 +30,11 @@ func (dn *DataNode) UpdateEcShards(actualShards []*erasure_coding.EcVolumeInfo)
 		} else {
 			// found, but maybe the actual shard could be missing
 			a := actualEcShards.Minus(ecShards)
-			if len(a.ShardIds()) > 0 {
+			if a.ShardIdCount() > 0 {
 				newShards = append(newShards, a)
 			}
 			d := ecShards.Minus(actualEcShards)
-			if len(d.ShardIds()) > 0 {
+			if d.ShardIdCount() > 0 {
 				deletedShards = append(deletedShards, d)
 			}
 		}
diff --git a/weed/topology/topology_ec.go b/weed/topology/topology_ec.go
index eb52b44b4..050a0b901 100644
--- a/weed/topology/topology_ec.go
+++ b/weed/topology/topology_ec.go
@@ -7,11 +7,9 @@ import (
 	"github.com/chrislusf/seaweedfs/weed/storage/needle"
 )
 
-const shardCount = erasure_coding.DataShardsCount + erasure_coding.ParityShardsCount
-
 type EcShardLocations struct {
 	Collection string
-	locations  [shardCount][]*DataNode
+	locations  [erasure_coding.TotalShardsCount][]*DataNode
 }
 
 func (t *Topology) SyncDataNodeEcShards(shardInfos []*master_pb.VolumeEcShardInformationMessage, dn *DataNode) (newShards, deletedShards []*erasure_coding.EcVolumeInfo) {