#### 20191219 10:10:10,234 | com.alibaba.druid.filter.logging.Log4jFilter.statementLogError(Log4jFilter.java:152) | ERROR | {conn-10593, pstmt-38675} execute error. update operation_service set offlinemark = ? , resourcestatus = ? where RowGuid = ?
com.mysql.jdbc.exceptions.jdbc4.MySQLTransactionRollbackException: Lock wait timeout exceeded; try restarting transaction
上述这个错误,接触 MySQL 的同学或多或少应该都遇到过,专业一点来说,这个报错我们称之为
锁等待超时
。
下面先写下统一的模拟场景,用于复现行锁超时问题,便于大家理解:
--表结构
CREATE TABLE `emp` (
`id` int(11) NOT NULL,
KEY `idx_id` (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4
从1~100w插入100w行记录。
--测试过程:
事务1:
start transaction;
delete from emp where id = 1;
select * from emp where id in (select id from emp); -->模拟慢查询,执行时间很久,事务因此一直不提交,行锁也不释放.
commit;
事务2:
start transaction;
delete from emp where id < 10; --> 处于等待id=1的行锁状态,当达到行锁超时时间(这里我配置了超时时间为 5s)后,返回行锁超时报错
rollback;
5.1 手动复现场景
这个场景通常只需要通过 innodb 行锁等待脚本就可以知道当前 MySQL 的 innodb 行锁等待情况,例如我们一边模拟上述报错场景(模拟页面操作),另一边使用脚本查询(需要在超时之前查询,否则超时报错后就看不到了)。
/*innodb 行锁等待脚本*/
SELECT r.trx_mysql_thread_id waiting_thread,r.trx_query waiting_query,
concat(timestampdiff(SECOND,r.trx_wait_started,CURRENT_TIMESTAMP()),'s') AS duration,
b.trx_mysql_thread_id blocking_thread,t.processlist_command state,b.trx_query blocking_current_query,e.sql_text blocking_last_query
FROM information_schema.innodb_lock_waits w
JOIN information_schema.innodb_trx b ON b.trx_id = w.blocking_trx_id
JOIN information_schema.innodb_trx r ON r.trx_id = w.requesting_trx_id
JOIN performance_schema.threads t on t.processlist_id = b.trx_mysql_thread_id
JOIN performance_schema.events_statements_current e USING(thread_id)
--查看general_log是否开启,及文件名
mysql> show variables like '%general_log%';
+------------------+--------------------------------------+
| Variable_name | Value |
+------------------+--------------------------------------+
| general_log | OFF |
| general_log_file | /data/mysql_data/192-168-188-155.log |
+------------------+--------------------------------------+
--暂时开启general_log
mysql> set global general_log = 1;
Query OK, 0 rows affected (0.00 sec)
--暂时关闭general_log
mysql> set global general_log = 0;
Query OK, 0 rows affected (0.00 sec)
开启 general_
log 后,手动复现的时候通过 innodb 行锁等待脚本查询结果中的线程 ID,去 general_
log 找到对应的事务分析即可,如下:
我一般是通过在 Linux 上后台跑监控脚本(innodb_
lock_
monitor.sh)来记录 MySQL 阻塞情况,脚本如下:
#!/bin/bash
#账号、密码、监控日志
user="root"
password="Gepoint"
logfile="/root/innodb_lock_monitor.log"
while true
do
num=`mysql -u${user} -p${password} -e "select count(*) from information_schema.innodb_lock_waits" |grep -v count`
if [[ $num -gt 0 ]];then
date >> /root/innodb_lock_monitor.log
mysql -u${user} -p${password} -e "SELECT r.trx_mysql_thread_id waiting_thread,r.trx_query waiting_query, \
concat(timestampdiff(SECOND,r.trx_wait_started,CURRENT_TIMESTAMP()),'s') AS duration,\
b.trx_mysql_thread_id blocking_thread,t.processlist_command state,b.trx_query blocking_query,e.sql_text \
FROM information_schema.innodb_lock_waits w \
JOIN information_schema.innodb_trx b ON b.trx_id = w.blocking_trx_id \
JOIN information_schema.innodb_trx r ON r.trx_id = w.requesting_trx_id \
JOIN performance_schema.threads t on t.processlist_id = b.trx_mysql_thread_id \
JOIN performance_schema.events_statements_current e USING(thread_id) \G " >> ${logfile}
fi
sleep 5
done
--使用 nohup 命令后台运行监控脚本
[root@192-168-188-155 ~]# nohup sh innodb_lock_monitor.sh &
[2] 31464
nohup: ignoring input and appending output to ‘nohup.out’
--查看 nohup.out 是否出现报错
[root@192-168-188-155 ~]# tail -f nohup.out
mysql: [Warning] Using a password on the command line interface can be insecure.
mysql: [Warning] Using a password on the command line interface can be insecure.
mysql: [Warning] Using a password on the command line interface can be insecure.
--定时查看监控日志是否有输出(没有输出的话,这个日志也不会生成哦!)
[root@192-168-188-155 ~]# tail -f innodb_lock_monitor.log
Wed Feb 5 11:30:11 CST 2020
*************************** 1. row ***************************
waiting_thread: 112
waiting_query: delete from emp where id < 10
duration: 3s
blocking_thread: 111
state: Sleep
blocking_query: NULL
sql_text: select * from emp where id in (select id from emp)
缺点:
1. history_
long 相关表默认保留记录有限,可能会把有用的数据刷掉,尤其是在 SQL 运行较多的系统。
2. 如果要加大 history_
long 相关表的最大保留行数,需要重启 MySQL,无法在线修改参数。
3. history_
long 相关表记录中的时间均为相对时间,也就是距离 MySQL 启动的时长,看起来不是很方便。
4. history_
long 相关表不会主动记录行锁等待的信息,所以只能通过先根据时间范围刷选出可疑的事务,再进一步分析,不如脚本监控定位的准。
/*开启performance_schema相关监控项,需要提前开启performance_schema*/
UPDATE performance_schema.setup_instruments SET ENABLED = 'YES', TIMED = 'YES' where name = 'transaction';
UPDATE performance_schema.setup_consumers SET ENABLED = 'YES' where name like '%events_transactions%';
UPDATE performance_schema.setup_consumers SET ENABLED = 'YES' where name like '%events_statements%';
/*查看回滚事务SQL,确认是否是日志里报错的事务*/
SELECT a.THREAD_ID
,b.EVENT_ID
,a.EVENT_NAME
,CONCAT (b.TIMER_WAIT / 1000000000000,'s') AS trx_druation
,CONCAT (a.TIMER_WAIT / 1000000000000,'s') sql_druation
,a.SQL_TEXT,b.STATE,a.MESSAGE_TEXT
FROM performance_schema.events_statements_history_long a
JOIN performance_schema.events_transactions_history_long b ON a.THREAD_ID = b.THREAD_ID
AND (a.NESTING_EVENT_ID = b.EVENT_ID OR a.EVENT_ID = b.NESTING_EVENT_ID)
WHERE b.autocommit = 'NO' AND a.SQL_TEXT IS NOT NULL AND b.STATE = 'ROLLED BACK'
/*查看该时间段内可疑事务即超过5s的事务SQL,这里默认innodb_lock_wait_timeout为5s*/
SELECT a.THREAD_ID
,b.EVENT_ID
,a.EVENT_NAME
,CONCAT (b.TIMER_WAIT / 1000000000000,'s') AS trx_druation
,CONCAT (a.TIMER_WAIT / 1000000000000,'s') sql_druation
,a.SQL_TEXT,b.STATE,a.MESSAGE_TEXT,a.ROWS_AFFECTED,a.ROWS_EXAMINED,a.ROWS_SENT
FROM performance_schema.events_statements_history_long a
JOIN performance_schema.events_transactions_history_long b ON a.THREAD_ID = b.THREAD_ID
AND (a.NESTING_EVENT_ID = b.EVENT_ID OR a.EVENT_ID = b.NESTING_EVENT_ID)
WHERE b.autocommit = 'NO' AND SQL_TEXT IS NOT NULL AND b.STATE = 'COMMITTED'
AND b.TIMER_WAIT / 1000000000000 > 5
AND b.TIMER_START < (SELECT TIMER_START FROM performance_schema.events_transactions_history_long
WHERE THREAD_ID = 70402 /*上述SQL查询结果中的线程ID*/
AND EVENT_ID = 518) /*上述SQL查询结果中的事件ID*/
AND b.TIMER_END > ( SELECT TIMER_END FROM performance_schema.events_transactions_history_long
WHERE THREAD_ID = 70402 /*上述SQL查询结果中的线程ID*/
AND EVENT_ID = 518) /*上述SQL查询结果中的事件ID*/
ORDER BY a.THREAD_ID