lightning: support prepared statement and client stmt cache in logical import mode #55482

Merged: 29 commits, merged on Sep 21, 2024

Commits:
d457da8
connection pooling
dbsid Jul 23, 2024
8412357
cleanup
dbsid Aug 16, 2024
b0c79df
fix bug
dbsid Aug 17, 2024
7237d94
fix concurrent map writes
dbsid Aug 18, 2024
df855d6
fix bug: add missing config
dbsid Aug 18, 2024
07592bb
avoid connection creation and close
dbsid Aug 18, 2024
4be22d7
remove the max open connection hard limit
dbsid Aug 18, 2024
5243b09
try fix the nil pointer error
dbsid Aug 19, 2024
ee6704e
try fix the build error
dbsid Aug 19, 2024
cb332bf
fix build error
dbsid Aug 19, 2024
1fd3ce7
fix unused key
dbsid Aug 19, 2024
17db4e6
Merge branch 'master' into prep-stmt-lightning-master
dbsid Aug 21, 2024
8f2437e
Update pkg/lightning/backend/tidb/tidb.go
dbsid Aug 21, 2024
afe73a5
address some of the comments
dbsid Aug 30, 2024
3d6e4e4
fix overriding existing stmt
dbsid Aug 30, 2024
4dc3fdc
make bazel_prepare
dbsid Aug 30, 2024
a80c974
avoid to create the LRU cache when LogicalImportPrepStmt is false
dbsid Sep 3, 2024
066f7f8
Merge remote-tracking branch 'upstream/master' into prep-stmt-lightni…
dbsid Sep 4, 2024
5b808ae
remove the nil check
dbsid Sep 4, 2024
132c4a3
address comments
dbsid Sep 6, 2024
2c0bc66
address comments
dbsid Sep 9, 2024
539a0d1
Merge remote-tracking branch 'upstream/master' into prep-stmt-lightni…
dbsid Sep 9, 2024
3e08969
address comments
dbsid Sep 14, 2024
0bae547
address comments
dbsid Sep 14, 2024
0c25621
make bazel_prepare
dbsid Sep 14, 2024
1ce0977
add integration test
dbsid Sep 17, 2024
662f938
Merge remote-tracking branch 'upstream/master' into prep-stmt-lightni…
dbsid Sep 19, 2024
6727f20
remove the printf for debug
dbsid Sep 19, 2024
8869768
Merge remote-tracking branch 'upstream/master' into prep-stmt-lightni…
dbsid Sep 21, 2024
14 changes: 8 additions & 6 deletions lightning/pkg/importer/chunk_process.go
@@ -177,9 +177,10 @@ func (cr *chunkProcessor) process(
// Create the encoder.
kvEncoder, err := rc.encBuilder.NewEncoder(ctx, &encode.EncodingConfig{
SessionOptions: encode.SessionOptions{
SQLMode: rc.cfg.TiDB.SQLMode,
Timestamp: cr.chunk.Timestamp,
SysVars: rc.sysVars,
SQLMode: rc.cfg.TiDB.SQLMode,
Timestamp: cr.chunk.Timestamp,
SysVars: rc.sysVars,
LogicalImportPrepStmt: rc.cfg.TikvImporter.LogicalImportPrepStmt,
// use chunk.PrevRowIDMax as the auto random seed, so it can stay the same value after recover from checkpoint.
AutoRandomSeed: cr.chunk.Chunk.PrevRowIDMax,
},
@@ -262,9 +263,10 @@ func (cr *chunkProcessor) encodeLoop(

originalTableEncoder, err = rc.encBuilder.NewEncoder(ctx, &encode.EncodingConfig{
SessionOptions: encode.SessionOptions{
SQLMode: rc.cfg.TiDB.SQLMode,
Timestamp: cr.chunk.Timestamp,
SysVars: rc.sysVars,
SQLMode: rc.cfg.TiDB.SQLMode,
Timestamp: cr.chunk.Timestamp,
SysVars: rc.sysVars,
LogicalImportPrepStmt: rc.cfg.TikvImporter.LogicalImportPrepStmt,
// use chunk.PrevRowIDMax as the auto random seed, so it can stay the same value after recover from checkpoint.
AutoRandomSeed: cr.chunk.Chunk.PrevRowIDMax,
},
7 changes: 4 additions & 3 deletions pkg/lightning/backend/encode/encode.go
@@ -52,9 +52,10 @@ type Encoder interface {

// SessionOptions is the initial configuration of the session.
type SessionOptions struct {
SQLMode mysql.SQLMode
Timestamp int64
SysVars map[string]string
SQLMode mysql.SQLMode
Timestamp int64
SysVars map[string]string
LogicalImportPrepStmt bool
// a seed used for tableKvEncoder's auto random bits value
AutoRandomSeed int64
// IndexID is used by the DuplicateManager. Only the key range with the specified index ID is scanned.
5 changes: 4 additions & 1 deletion pkg/lightning/backend/tidb/BUILD.bazel
@@ -21,6 +21,8 @@ go_library(
"//pkg/table",
"//pkg/types",
"//pkg/util/dbutil",
"//pkg/util/hack",
"//pkg/util/kvcache",
"//pkg/util/redact",
"@com_github_go_sql_driver_mysql//:mysql",
"@com_github_google_uuid//:uuid",
@@ -36,7 +38,7 @@ go_test(
timeout = "short",
srcs = ["tidb_test.go"],
flaky = True,
shard_count = 15,
shard_count = 16,
deps = [
":tidb",
"//pkg/errno",
@@ -58,5 +60,6 @@ go_test(
"@com_github_go_sql_driver_mysql//:mysql",
"@com_github_stretchr_testify//require",
"@org_uber_go_atomic//:atomic",
"@org_uber_go_zap//:zap",
],
)
136 changes: 109 additions & 27 deletions pkg/lightning/backend/tidb/tidb.go
@@ -21,6 +21,7 @@ import (
"fmt"
"strconv"
"strings"
"sync"
"time"

gmysql "github.com/go-sql-driver/mysql"
@@ -42,6 +43,8 @@ import (
"github.com/pingcap/tidb/pkg/table"
"github.com/pingcap/tidb/pkg/types"
"github.com/pingcap/tidb/pkg/util/dbutil"
"github.com/pingcap/tidb/pkg/util/hack"
"github.com/pingcap/tidb/pkg/util/kvcache"
"github.com/pingcap/tidb/pkg/util/redact"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
@@ -55,12 +58,16 @@ var extraHandleTableColumn = &table.Column{

const (
writeRowsMaxRetryTimes = 3
// To limit memory usage for prepared statements.
prepStmtCacheSize uint = 100
)

type tidbRow struct {
insertStmt string
path string
offset int64
insertStmt string
preparedInsertStmt string
values []any
path string
offset int64
}

var emptyTiDBRow = tidbRow{
@@ -90,8 +97,9 @@ type tidbEncoder struct {
// the there are enough columns.
columnCnt int
// data file path
path string
logger log.Logger
path string
logger log.Logger
logicalImportPrepStmt bool
}

type encodingBuilder struct{}
@@ -105,10 +113,11 @@ func NewEncodingBuilder() encode.EncodingBuilder {
// It implements the `backend.EncodingBuilder` interface.
func (*encodingBuilder) NewEncoder(_ context.Context, config *encode.EncodingConfig) (encode.Encoder, error) {
return &tidbEncoder{
mode: config.SQLMode,
tbl: config.Table,
path: config.Path,
logger: config.Logger,
mode: config.SQLMode,
tbl: config.Table,
path: config.Path,
logger: config.Logger,
logicalImportPrepStmt: config.LogicalImportPrepStmt,
}, nil
}

@@ -287,6 +296,22 @@ func (*targetInfoGetter) CheckRequirements(ctx context.Context, _ *backend.Check
return nil
}

// stmtKey defines key for stmtCache.
type stmtKey struct {
query string
// `hash` is the hash value of this object.
hash []byte
}

// Hash implements SimpleLRUCache.Key.
func (k *stmtKey) Hash() []byte {
Contributor comment: I'm not sure why the kvcache.Key interface calls it Hash; Hash means a fixed-length digest.

if len(k.hash) == 0 {
k.hash = make([]byte, 0, len(k.query))
Contributor comment: Maybe use table name + number of rows as the key? I'm not sure if comparing the bytes is a bottleneck in Lightning's CPU profile. (An illustrative sketch of that alternative key follows this function.)

k.hash = append(k.hash, hack.Slice(k.query)...)
}
return k.hash
}
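An illustrative sketch of the alternative key floated in the comment above, keyed by table name and row count rather than the full statement text. The name altStmtKey is hypothetical and this is not part of the PR's diff; it assumes the same fmt import as the surrounding file, and the PR itself keeps the full query as the key.

// altStmtKey is a hypothetical alternative to stmtKey: it keys the statement
// cache by table name and row count instead of the full INSERT text.
type altStmtKey struct {
	table string
	rows  int
	hash  []byte
}

// Hash implements the same kvcache key contract as stmtKey above.
func (k *altStmtKey) Hash() []byte {
	if len(k.hash) == 0 {
		k.hash = []byte(fmt.Sprintf("%s/%d", k.table, k.rows))
	}
	return k.hash
}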

type tidbBackend struct {
db *sql.DB
conflictCfg config.Conflict
@@ -300,6 +325,11 @@ type tidbBackend struct {
// affecting the cluster too much.
maxChunkSize uint64
maxChunkRows int
// implement stmtCache to improve performance
stmtCache *kvcache.SimpleLRUCache
Contributor comment: As long as the table is importing, the prepared stmt is needed; no need for such an LRU.

Author (dbsid) reply: The reason to use an LRU here is that the batch insert prepared statements may not all be the same. Since the insert/replace statement is constrained by logical-import-batch-size, the actual SQL executed might be longer or shorter depending on the content being imported. (See the illustrative sketch after this hunk.)

Contributor comment: Depending on row size is unstable for prepared statements; in the worst case every prepared stmt is different. We can ignore batch-size in this case and only honor batch-rows, cc @lance6716

Author (dbsid) reply: The default logical-import-batch-rows is 65536, which is usually too large a batch. Should we lower the default of logical-import-batch-rows to some reasonable number like 128/256/512?

Contributor comment: Maybe we can ask the user that requested this feature what the root cause is (#46607). If the prepared statement can solve that problem, we can ignore these configurations. @D3Hunter and I are busy this week, please help to contact the issue author.

Contributor comment: Not the author of #46607, but we've got a customer whose database is unable to handle 96 KB transactions, thus requiring the limit to be lowered. The -batch-size does not need to be very precise. It is mainly used to ensure (number of rows) × (size of row) is maintained at a constant limit. You should not ignore the batch-size entirely, because the same limit is shared among all tables and the constraint should be reasonable for both wide and narrow tables.

stmtCacheMutex sync.RWMutex
// Indicate if the CachePrepStmts should be enabled or not
cachePrepStmts bool
}

var _ backend.Backend = (*tidbBackend)(nil)
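To make the review thread above concrete, here is a small, self-contained sketch (not part of the PR) of why batches against the same table can produce differently shaped prepared statements: the number of rows packed under logical-import-batch-size varies, so the placeholder count, and therefore the statement text, varies too, which is why the cache is bounded by an LRU. The helper name demoBatchSQL is made up for illustration.

package main

import (
	"fmt"
	"strings"
)

// demoBatchSQL builds the placeholder form of a multi-row INSERT for a given
// column and row count, mirroring the shape of what tidbEncoder.Encode
// produces when logicalImportPrepStmt is enabled.
func demoBatchSQL(table string, cols, rows int) string {
	oneRow := "(" + strings.TrimSuffix(strings.Repeat("?,", cols), ",") + ")"
	allRows := strings.TrimSuffix(strings.Repeat(oneRow+",", rows), ",")
	return fmt.Sprintf("INSERT INTO %s VALUES %s", table, allRows)
}

func main() {
	// Two batches over the same table with different row counts yield two
	// distinct statement texts, i.e. two distinct prepared statements.
	fmt.Println(demoBatchSQL("t", 2, 3)) // INSERT INTO t VALUES (?,?),(?,?),(?,?)
	fmt.Println(demoBatchSQL("t", 2, 2)) // INSERT INTO t VALUES (?,?),(?,?)
}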
@@ -333,13 +363,21 @@ func NewTiDBBackend(
log.FromContext(ctx).Warn("unsupported conflict strategy for TiDB backend, overwrite with `error`")
onDuplicate = config.ErrorOnDup
}
stmtCache := kvcache.NewSimpleLRUCache(prepStmtCacheSize, 0, 0)
stmtCache.SetOnEvict(func(_ kvcache.Key, value kvcache.Value) {
stmt := value.(*sql.Stmt)
stmt.Close()
})
return &tidbBackend{
db: db,
conflictCfg: conflict,
onDuplicate: onDuplicate,
errorMgr: errorMgr,
maxChunkSize: uint64(cfg.TikvImporter.LogicalImportBatchSize),
maxChunkRows: cfg.TikvImporter.LogicalImportBatchRows,
db: db,
conflictCfg: conflict,
onDuplicate: onDuplicate,
errorMgr: errorMgr,
maxChunkSize: uint64(cfg.TikvImporter.LogicalImportBatchSize),
maxChunkRows: cfg.TikvImporter.LogicalImportBatchRows,
stmtCache: stmtCache,
stmtCacheMutex: sync.RWMutex{},
Contributor suggested change: remove the explicit stmtCacheMutex: sync.RWMutex{}, initializer; the zero value of sync.RWMutex is already usable.
cachePrepStmts: cfg.TikvImporter.LogicalImportPrepStmt,
}
}

@@ -555,16 +593,24 @@ func (enc *tidbEncoder) Encode(row []types.Datum, _ int64, columnPermutation []i
return emptyTiDBRow, errors.Errorf("column count mismatch, at most %d but got %d", len(enc.columnIdx), len(row))
}

var encoded strings.Builder
var encoded, preparedInsertStmt strings.Builder
var values []any
encoded.Grow(8 * len(row))
encoded.WriteByte('(')
if enc.logicalImportPrepStmt {
preparedInsertStmt.Grow(2 * len(row))
preparedInsertStmt.WriteByte('(')
}
cnt := 0
for i, field := range row {
if enc.columnIdx[i] < 0 {
continue
}
if cnt > 0 {
encoded.WriteByte(',')
if enc.logicalImportPrepStmt {
preparedInsertStmt.WriteByte(',')
}
}
datum := field
if err := enc.appendSQL(&encoded, &datum, getColumnByIndex(cols, enc.columnIdx[i])); err != nil {
@@ -575,13 +621,23 @@ func (enc *tidbEncoder) Encode(row []types.Datum, _ int64, columnPermutation []i
)
return nil, err
}
if enc.logicalImportPrepStmt {
preparedInsertStmt.WriteByte('?')
values = append(values, datum.GetValue())
}
cnt++
}
encoded.WriteByte(')')
if enc.logicalImportPrepStmt {
preparedInsertStmt.WriteByte(')')
}

return tidbRow{
insertStmt: encoded.String(),
path: enc.path,
offset: offset,
insertStmt: encoded.String(),
preparedInsertStmt: preparedInsertStmt.String(),
values: values,
path: enc.path,
offset: offset,
}, nil
}

@@ -664,8 +720,9 @@ rowLoop:
}

type stmtTask struct {
rows tidbRows
stmt string
rows tidbRows
stmt string
values []any
}

// WriteBatchRowsToDB write rows in batch mode, which will insert multiple rows like this:
@@ -678,14 +735,20 @@ func (be *tidbBackend) WriteBatchRowsToDB(ctx context.Context, tableName string,
}
// Note: we are not going to do interpolation (prepared statements) to avoid
// complication arise from data length overflow of BIT and BINARY columns
var values []any
stmtTasks := make([]stmtTask, 1)
for i, row := range rows {
if i != 0 {
insertStmt.WriteByte(',')
}
insertStmt.WriteString(row.insertStmt)
if be.cachePrepStmts {
insertStmt.WriteString(row.preparedInsertStmt)
values = append(values, row.values...)
} else {
insertStmt.WriteString(row.insertStmt)
}
}
stmtTasks[0] = stmtTask{rows, insertStmt.String()}
stmtTasks[0] = stmtTask{rows, insertStmt.String(), values}
return be.execStmts(ctx, stmtTasks, tableName, true)
}

@@ -715,7 +778,7 @@ func (be *tidbBackend) WriteRowsToDB(ctx context.Context, tableName string, colu
var finalInsertStmt strings.Builder
finalInsertStmt.WriteString(is)
finalInsertStmt.WriteString(row.insertStmt)
stmtTasks = append(stmtTasks, stmtTask{[]tidbRow{row}, finalInsertStmt.String()})
stmtTasks = append(stmtTasks, stmtTask{[]tidbRow{row}, finalInsertStmt.String(), []any{}})
}
return be.execStmts(ctx, stmtTasks, tableName, false)
}
@@ -753,8 +816,27 @@ stmtLoop:
err error
)
for i := 0; i < writeRowsMaxRetryTimes; i++ {
stmt := stmtTask.stmt
result, err = be.db.ExecContext(ctx, stmt)
query := stmtTask.stmt
if be.cachePrepStmts {
var prepStmt *sql.Stmt
key := &stmtKey{query: query}
be.stmtCacheMutex.RLock()
stmt, ok := be.stmtCache.Get(key)
be.stmtCacheMutex.RUnlock()
if ok {
prepStmt = stmt.(*sql.Stmt)
} else if stmt, err := be.db.Prepare(query); err == nil {
prepStmt = stmt
be.stmtCacheMutex.Lock()
be.stmtCache.Put(key, stmt)
be.stmtCacheMutex.Unlock()
} else {
return errors.Trace(err)
}
result, err = prepStmt.ExecContext(ctx, stmtTask.values...)
} else {
result, err = be.db.ExecContext(ctx, query)
}
if err == nil {
affected, err2 := result.RowsAffected()
if err2 != nil {
@@ -775,7 +857,7 @@

if !common.IsContextCanceledError(err) {
log.FromContext(ctx).Error("execute statement failed",
zap.Array("rows", stmtTask.rows), zap.String("stmt", redact.Value(stmt)), zap.Error(err))
zap.Array("rows", stmtTask.rows), zap.String("stmt", redact.Value(query)), zap.Error(err))
}
// It's batch mode, just return the error. Caller will fall back to row-by-row mode.
if batch {
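For readers who only skim the diff, the following is a condensed, self-contained sketch of the execution path this PR adds to the TiDB backend: build a multi-row INSERT with placeholders, look the statement text up in a cache, prepare it on a miss, then execute it with the collected values. It is a simplification under stated assumptions, not the PR's code: it uses a plain mutex-guarded map where the PR uses kvcache.SimpleLRUCache (bounded by prepStmtCacheSize, with eviction closing the statement), it ignores retries and conflict handling, a concurrent miss may prepare the same text twice, and the DSN and table are made up.

package main

import (
	"context"
	"database/sql"
	"sync"

	_ "github.com/go-sql-driver/mysql"
)

// stmtCache is a simplified stand-in for the PR's kvcache.SimpleLRUCache: it
// maps the full statement text to its prepared *sql.Stmt. The real cache is
// bounded and closes evicted statements; this sketch grows without bound.
type stmtCache struct {
	mu    sync.RWMutex
	stmts map[string]*sql.Stmt
}

// execBatch mirrors the cachePrepStmts branch of execStmts in the diff:
// reuse a cached prepared statement when possible, otherwise prepare and cache it.
func (c *stmtCache) execBatch(ctx context.Context, db *sql.DB, query string, values []any) (sql.Result, error) {
	c.mu.RLock()
	st, ok := c.stmts[query]
	c.mu.RUnlock()
	if !ok {
		var err error
		if st, err = db.PrepareContext(ctx, query); err != nil {
			return nil, err
		}
		c.mu.Lock()
		c.stmts[query] = st
		c.mu.Unlock()
	}
	return st.ExecContext(ctx, values...)
}

func main() {
	// Hypothetical DSN and table; adjust for a real cluster.
	db, err := sql.Open("mysql", "root@tcp(127.0.0.1:4000)/test")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	cache := &stmtCache{stmts: make(map[string]*sql.Stmt)}
	// Two rows of two columns each, mirroring what tidbEncoder.Encode collects
	// into preparedInsertStmt and values when the feature is enabled.
	query := "INSERT INTO t (a, b) VALUES (?,?),(?,?)"
	values := []any{1, "x", 2, "y"}
	if _, err := cache.execBatch(context.Background(), db, query, values); err != nil {
		panic(err)
	}
}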