2021-09-08 14:36:04 INFO Listening on unix socket file: /tmp/gh-ost.peppa.user_details.sock
2021-09-08 14:36:15 INFO As instructed, counting rows in the background; meanwhile I will use an estimated count, and will update it later on
this.addDMLEventsListener()
开始侦听原始表上的 binlog 事件,并为每个此类事件创建和排队写入任务。
2021-09-08 14:37:57 INFO As instructed, I'm issuing a SELECT COUNT(*) on the table. This may take a while
this.applier.ReadMigrationRangeValues()
读取将用于行复制的最小/最大值。
2021-09-08 14:40:17 INFO Exact number of rows via COUNT: 730000
2021-09-08 14:40:17 INFO Migration min values: [0]
2021-09-08 14:40:17 INFO Migration max values: [99999]
this.initiateThrottler()
启动节流收集和节流检查。
2021-09-08 14:41:24 INFO Waiting for first throttle metrics to be collected
2021-09-08 14:41:24 INFO First throttle metrics collected
func (this *Migrator) executeWriteFuncs() error {
if this.migrationContext.Noop {
this.migrationContext.Log.Debugf("Noop operation; not really executing write funcs")
return nil
}
for {
if atomic.LoadInt64(&this.finishedMigrating) > 0 {
return nil
}
this.throttler.throttle(nil)
// We give higher priority to event processing, then secondary priority to
// rowcopy
select {
case eventStruct := <-this.applyEventsQueue:
{
if err := this.onApplyEventStruct(eventStruct); err != nil {
return err
}
}
default:
{
select {
case copyRowsFunc := <-this.copyRowsQueue:
{
copyRowsStartTime := time.Now()
// Retries are handled within the copyRowsFunc
if err := copyRowsFunc(); err != nil {
return this.migrationContext.Log.Errore(err)
}
if niceRatio := this.migrationContext.GetNiceRatio(); niceRatio > 0 {
copyRowsDuration := time.Since(copyRowsStartTime)
sleepTimeNanosecondFloat64 := niceRatio * float64(copyRowsDuration.Nanoseconds())
sleepTime := time.Duration(time.Duration(int64(sleepTimeNanosecondFloat64)) * time.Nanosecond)
time.Sleep(sleepTime)
}
}
default:
{
// Hmmmmm... nothing in the queue; no events, but also no row copy.
// This is possible upon load. Let's just sleep it over.
this.migrationContext.Log.Debugf("Getting nothing in the write queue. Sleeping...")
time.Sleep(time.Second)
}
}
}
}
}
return nil
}
go this.iterateChunks()
迭代现有的表行,并将多行的复制任务生成到幽灵表上。
func (this *Migrator) iterateChunks() error {
terminateRowIteration := func(err error) error {
this.rowCopyComplete <- err
return this.migrationContext.Log.Errore(err)
}
if this.migrationContext.Noop {
this.migrationContext.Log.Debugf("Noop operation; not really copying data")
return terminateRowIteration(nil)
}
if this.migrationContext.MigrationRangeMinValues == nil {
this.migrationContext.Log.Debugf("No rows found in table. Rowcopy will be implicitly empty")
return terminateRowIteration(nil)
}
var hasNoFurtherRangeFlag int64
// Iterate per chunk:
for {
if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 || atomic.LoadInt64(&hasNoFurtherRangeFlag) == 1 {
// Done
// There's another such check down the line
return nil
}
copyRowsFunc := func() error {
if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 || atomic.LoadInt64(&hasNoFurtherRangeFlag) == 1 {
// Done.
// There's another such check down the line
return nil
}
// When hasFurtherRange is false, original table might be write locked and CalculateNextIterationRangeEndValues would hangs forever
hasFurtherRange := false
if err := this.retryOperation(func() (e error) {
hasFurtherRange, e = this.applier.CalculateNextIterationRangeEndValues()
return e
}); err != nil {
return terminateRowIteration(err)
}
if !hasFurtherRange {
atomic.StoreInt64(&hasNoFurtherRangeFlag, 1)
return terminateRowIteration(nil)
}
// Copy task:
applyCopyRowsFunc := func() error {
if atomic.LoadInt64(&this.rowCopyCompleteFlag) == 1 {
// No need for more writes.
// This is the de-facto place where we avoid writing in the event of completed cut-over.
// There could _still_ be a race condition, but that's as close as we can get.
// What about the race condition? Well, there's actually no data integrity issue.
// when rowCopyCompleteFlag==1 that means **guaranteed** all necessary rows have been copied.
// But some are still then collected at the binary log, and these are the ones we're trying to
// not apply here. If the race condition wins over us, then we just attempt to apply onto the
// _ghost_ table, which no longer exists. So, bothering error messages and all, but no damage.
return nil
}
_, rowsAffected, _, err := this.applier.ApplyIterationInsertQuery()
if err != nil {
return err // wrapping call will retry
}
atomic.AddInt64(&this.migrationContext.TotalRowsCopied, rowsAffected)
atomic.AddInt64(&this.migrationContext.Iteration, 1)
return nil
}
if err := this.retryOperation(applyCopyRowsFunc); err != nil {
return terminateRowIteration(err)
}
return nil
}
// Enqueue copy operation; to be executed by executeWriteFuncs()
this.copyRowsQueue <- copyRowsFunc
}
return nil
}
func (this *Migrator) cutOver() (err error) {
if this.migrationContext.Noop {
this.migrationContext.Log.Debugf("Noop operation; not really swapping tables")
return nil
}
this.migrationContext.MarkPointOfInterest()
this.throttler.throttle(func() {
this.migrationContext.Log.Debugf("throttling before swapping tables")
})
this.migrationContext.MarkPointOfInterest()
this.migrationContext.Log.Debugf("checking for cut-over postpone")
this.sleepWhileTrue(
func() (bool, error) {
heartbeatLag := this.migrationContext.TimeSinceLastHeartbeatOnChangelog()
maxLagMillisecondsThrottle := time.Duration(atomic.LoadInt64(&this.migrationContext.MaxLagMillisecondsThrottleThreshold)) * time.Millisecond
cutOverLockTimeout := time.Duration(this.migrationContext.CutOverLockTimeoutSeconds) * time.Second
if heartbeatLag > maxLagMillisecondsThrottle || heartbeatLag > cutOverLockTimeout {
this.migrationContext.Log.Debugf("current HeartbeatLag (%.2fs) is too high, it needs to be less than both --max-lag-millis (%.2fs) and --cut-over-lock-timeout-seconds (%.2fs) to continue", heartbeatLag.Seconds(), maxLagMillisecondsThrottle.Seconds(), cutOverLockTimeout.Seconds())
return true, nil
}
if this.migrationContext.PostponeCutOverFlagFile == "" {
return false, nil
}
if atomic.LoadInt64(&this.migrationContext.UserCommandedUnpostponeFlag) > 0 {
atomic.StoreInt64(&this.migrationContext.UserCommandedUnpostponeFlag, 0)
return false, nil
}
if base.FileExists(this.migrationContext.PostponeCutOverFlagFile) {
// Postpone file defined and exists!
if atomic.LoadInt64(&this.migrationContext.IsPostponingCutOver) == 0 {
if err := this.hooksExecutor.onBeginPostponed(); err != nil {
return true, err
}
}
atomic.StoreInt64(&this.migrationContext.IsPostponingCutOver, 1)
return true, nil
}
return false, nil
},
)
atomic.StoreInt64(&this.migrationContext.IsPostponingCutOver, 0)
this.migrationContext.MarkPointOfInterest()
this.migrationContext.Log.Debugf("checking for cut-over postpone: complete")
if this.migrationContext.TestOnReplica {
// With `--test-on-replica` we stop replication thread, and then proceed to use
// the same cut-over phase as the master would use. That means we take locks
// and swap the tables.
// The difference is that we will later swap the tables back.
if err := this.hooksExecutor.onStopReplication(); err != nil {
return err
}
if this.migrationContext.TestOnReplicaSkipReplicaStop {
this.migrationContext.Log.Warningf("--test-on-replica-skip-replica-stop enabled, we are not stopping replication.")
} else {
this.migrationContext.Log.Debugf("testing on replica. Stopping replication IO thread")
if err := this.retryOperation(this.applier.StopReplication); err != nil {
return err
}
}
}
if this.migrationContext.CutOverType == base.CutOverAtomic {
// Atomic solution: we use low timeout and multiple attempts. But for
// each failed attempt, we throttle until replication lag is back to normal
err := this.atomicCutOver()
this.handleCutOverResult(err)
return err
}
if this.migrationContext.CutOverType == base.CutOverTwoStep {
err := this.cutOverTwoStep()
this.handleCutOverResult(err)
return err
}
return this.migrationContext.Log.Fatalf("Unknown cut-over type: %d; should never get here!", this.migrationContext.CutOverType)
}