常规监控
-
连接数使用超过95%告警,最大连接数和当前连接数
select (select count(*) from pg_stat_activity) /(select setting from pg_settings where name = 'max_connections') ::numeric >0.95;
-
存在空闲连接idle in transaction大于5秒的连,如果长期存在这样的连接说明应用有问题,并且会造成数据库出现异常
select count(*) from pg_stat_activity where state = 'idle in transaction' and now()-query_start>'5s'::interval);
-
长事务,超过30分钟的慢sql
select count(*) from pg_stat_activity where state <> 'idle' and (backend_xid is not null or backend_xmin is not null) and now()-xact_start > interval '1800 sec'::interval;
-
数据库占用空间
select pg_size_pretty(pg_database_size('postgres'));select datname, pg_size_pretty(pg_database_size(datname)) from pg_database;
-
数据库中存在长时间阻塞的锁,阻塞超过3分钟
selectkl.pid as 阻塞pid,a.usename as blocked_user,ka.query as 阻塞sql,ka.state as 阻塞sql状态,now()- ka.query_start as 阻塞语句执行时长,--阻塞的语句执行时长bl.pid as 被阻塞pid,ka.usename as 被阻塞用户,a.query as 被阻塞sql,now()- a.query_start as 被阻塞语句执行时长--被阻塞语句执行时长frompg_locks bljoin pg_stat_activity a ona.pid = bl.pidjoin pg_locks kl onkl.transactionid = bl.transactionidand kl.pid != bl.pidjoin pg_stat_activity ka onka.pid = kl.pidwherenot bl.grantedand now()- ka.query_start >'60s'::interval;--阻塞时长超过1分钟--阻塞pid对应的是数据库中pid,可以使用select pg_terminate_backend(pid);进行kill
-
坏元组超过10000行,且占比超过20%的表
select current_database(),schemaname||'.'||relname,n_dead_tup,n_live_tup,round(n_dead_tup*100/ (n_live_tup )::numeric,2)||'%' AS dead_tup_ratioFROMpg_stat_all_tablesWHEREn_dead_tup >= 10000 and n_live_tup>0 and (n_dead_tup)/(n_live_tup)::numeric>0.25 and schemaname not in ('pg_toast','pg_catalog','information_schema')ORDER BY dead_tup_ratio DESCLIMIT 10;
-
wal日志生成量
--'AD/FAFFF2A8'是5分钟前select pg_current_wal_lsn()获取的值select pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),'AD/FAFFF2A8'));
-
表缓存命中率和索引缓存命中率,命中率低于80%说明缓存不足
-
注意,不常用的库缓存可能为0
SELECT'index hit rate' AS name,(sum(idx_blks_hit)) / nullif(sum(idx_blks_hit + idx_blks_read),0) AS ratioFROM pg_statio_user_indexesUNION ALLSELECT'table hit rate' AS name,sum(heap_blks_hit) / nullif(sum(heap_blks_hit) + sum(heap_blks_read),0) AS ratioFROM pg_statio_user_tables;
-
数据库年龄监控
-
当age达到2亿会触发自动清理,如果大于5亿则告警
select datname,age(datfrozenxid),pg_size_pretty(pg_database_size(oid)) from pg_database where age(datfrozenxid) > 500000000 order by age(datfrozenxid) desc limit 10 ;
-
表年龄(用于查看详情,默认数据库年龄会存储表年龄最大的一条)
select sp.nspname,relname,age(relfrozenxid), pg_size_pretty(pg_table_size(cl.oid)) from pg_class cl join pg_namespace sp on cl.relnamespace = sp.oid where relkind in ('t','r') order by age(relfrozenxid) desc limit 10;-- 说明:当age到达2亿(默认)时触发自动清理,期间会大量占用系统资源。提前做好监控避免在业务高峰时发生。可在库级别操作,也可在表基本操作。
-
事务提交,事务回滚,全表扫描,索引扫描,插入记录,更新记录
select sum(xact_commit),sum(xact_rollback),sum(tup_returned),sum(tup_fetched),sum(tup_inserted),sum(tup_updated),sum(tup_deleted),sum(conflicts),sum(deadlocks) from pg_stat_database;-- pg_stat_get_db_xact_commit 为stable函数,一个事务中两次调用之间只执行一次,所以需要外部多次执行。
回滚率计算:select xact_rollback/(xact_commit+xact_rollback),如果超过10%则代表回滚率较高
-
过去5分钟内生成wal个数,可以判断当前数据库的dml频繁程度
select count(1) from pg_catalog.pg_ls_waldir() where modification > CURRENT_TIMESTAMP - '5 minutes' :: INTERVAL ;
-
wal写入速率
-
lsn是一个递增的值
SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - '0/0' as wal_lsn;
高可用,逻辑复制
-
现场使用arrs或者高可用,出现异常后并不知情,可监控复制状态,以及日志堆积大小判断当前复制情况
pg10+
-
复制状态,true表示正常,false表示异常
select active from pg_replication_slots;
-
备库applay延迟,以及堆积情况
selectapplication_name,client_addr,client_hostname,client_port,state,sync_priority,sync_state,pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn)),pg_wal_lsn_diff(pg_current_wal_lsn(),write_lsn) write_delay,pg_wal_lsn_diff(pg_current_wal_lsn(),flush_lsn) flush_delay,pg_wal_lsn_diff(pg_current_wal_lsn(),replay_lsn) replay_delyfrom pg_stat_replication ;
-
arrs复制延迟,以及堆积情况
select slot_name, plugin, slot_type,temporary, active, active_pid,pg_size_pretty(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)) frompg_replication_slots ;
pg10以下
-
备库applay延迟,复制堆积超过1GB
selectapplication_name,client_addr,client_hostname,client_port,state,sync_priority,sync_state,pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_location(),replay_location)),pg_xlog_location_diff(pg_current_xlog_location(),write_location) write_delay,pg_xlog_location_diff(pg_current_xlog_location(),flush_location) flush_delay,pg_xlog_location_diff(pg_current_xlog_location(),replay_location) replay_delyfrom pg_stat_replication;select slot_name, plugin, slot_type,active, active_pid,pg_size_pretty(pg_xlog_location_diff(pg_current_xlog_location(), restart_lsn)) frompg_replication_slots;
-
主从流复制延时时间 (从库执行)
SELECT CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END10 版本及以后SELECT CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp()) END;-- 主从复制延迟字节 (主库执行)select greatest(0,pg_wal_lsn_diff(pg_current_wal_lsn(), replay_lsn)) from pg_stat_replication ; where client_addr = '从库
慢SQL监控
-
数据库需要安装插件pg_stat_statements
-
在数据库中当前频繁执行的SQL TOP5
SELECT C.rolname,b.datname,A.total_time / A.calls per_call_time,A.*FROMpg_stat_statements A,pg_database b,pg_authid CWHEREA.userid = C.oidAND A.dbid = b.oidORDER BYA.total_time DESCLIMIT 5
-
最耗时 SQL,单次调用最耗时 SQL TOP5
select userid::regrole, dbid, query from pg_stat_statements order by mean_time desc limit 5;
-
最耗共享内存 SQL
select userid::regrole, dbid, query from pg_stat_statements order by (shared_blks_hit+shared_blks_dirtied) desc limit 5;
-
总最耗IO SQL TOP5
select userid::regrole, dbid, query from pg_stat_statements order by (blk_read_time+blk_write_time) desc limit 5;
-
最耗IO SQL,单次调用最耗IO SQL TOP5
select userid::regrole, dbid, query from pg_stat_statements order by (blk_read_time+blk_write_time)/calls desc limit 5;
-
QPS
witha as(select sum(calls)s from pg_stat_statements),b as(select sum(calls)s from pg_stat_statements,pg_sleep(1))select b.s-a.s --QPSfrom a,b;
-
长期(30天)未更新统计信息的表,坏元组比例超过10%就应该触发更新统计信息(警告)
SELECTto_char((now() - last_analyze), 'DD'), to_char((now() - last_autoanalyze),'DD' ) ,current_database(),schemaname||'.'||relname,n_live_tup,n_dead_tup,round(n_dead_tup*100/n_live_tup::numeric,2)||'%' as dead_tup_ratioFROMpg_stat_all_tablesWHEREschemaname not in( 'pg_toast','pg_catalog','information_schema' )AND (last_analyze IS null OR to_char((now() - last_analyze), 'DD') :: NUMERIC > 30)AND (last_autovacuum IS null OR to_char((now() - last_autovacuum),'DD') :: NUMERIC > 30)AND (last_autoanalyze IS null OR to_char((now() - last_autoanalyze),'DD' ) :: NUMERIC > 30)and n_dead_tup >= 10000 and n_live_tup>0 and (n_dead_tup)/(n_live_tup)::numeric > 0.1;
-
未使用的索引
SELECTschemaname || '.' || relname AS table,indexrelname AS index,pg_size_pretty(pg_relation_size(i.indexrelid)) AS index_size,idx_scan as index_scansFROM pg_stat_user_indexes uiJOIN pg_index i ON ui.indexrelid = i.indexrelidWHERE NOT indisuniqueAND idx_scan < 5AND pg_relation_size(relid) > 5 * 8192ORDER BY pg_relation_size(i.indexrelid) / nullif(idx_scan, 0) DESC NULLS FIRST,pg_relation_size(i.indexrelid) DESC;--修改后:SELECTschemaname || '.' || relname AS table,indexrelname AS index,pg_size_pretty(pg_relation_size(i.indexrelid)) AS index_size,pg_size_pretty(pg_relation_size(relid)),idx_scan as index_scansFROM pg_stat_user_indexes uiJOIN pg_index i ON ui.indexrelid = i.indexrelidWHERE NOT indisuniqueAND idx_scan < 5AND pg_relation_size(relid) > 5 * 8192*1000ORDER BY pg_relation_size(i.indexrelid) desc limit 5;
-
表未创建主键
SELECTcurrent_database (),ns.nspname,obj.relnameFROMpg_class obj,pg_namespace nsWHEREobj.relnamespace = ns.oidAND obj.relkind = 'r'AND ns.nspname NOT IN ('pg_toast','pg_catalog','information_schema')AND obj.oid NOT IN (SELECTobj.oid FROMpg_class obj,pg_index idxWHEREobj.oid = idx.indrelidAND idx.indisprimary = 'true')limit 10;
-
数据库中存在触发器
SELECTcurrent_database (),ns.nspname,rel.relname,tri.tgnameFROMpg_class rel,pg_namespace ns,pg_trigger triWHEREns.nspname NOT IN ('pg_toast','pg_catalog','information_schema')AND rel.oid = tri.tgrelidAND rel.relnamespace = ns.oidAND rel.relhastriggers = true and tgisinternal= = false;
-
上班期间存在备份进程
select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname,query from pg_stat_activity st where state='active' and application_name='pg_dump';
-
上班期间存在创建索引进程
select st.application_name,st.datname,st.query_start,st.client_addr,client_hostnamefrom pg_stat_activity st where state='active' and (st.query ilike 'create index%' or st.query ilike 'create uniuqe index%');
-
上班期间重建索引
select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname from pg_stat_activity st where state='active' and query ilike 'reindex%';
-
vacuum full进程
select st.application_name,st.datname,st.query_start,st.client_addr,client_hostname,query from pg_stat_activity st where state='active' and query ilike 'vacuum full%';
-
重复索引
select allindx.schemaname,allindx.relname,allindx.indexrelname,ind.indexdef FROM pg_stat_all_indexes allindx,pg_indexes ind where allindx.relname=ind.tablename and allindx.indexrelname=ind.indexname and allindx.indexrelid :: VARCHARIN (SELECT regexp_split_to_table(indexs, $$,$$)FROM(SELECT string_agg (idx :: VARCHAR, $$,$$) AS indexsFROM(SELECT indexrelid :: oid AS idx,(indrelid :: TEXT || $$BB$$ || indclass :: TEXT || $$BB$$ || indkey :: TEXT || $$BB$$ || COALESCE (indexprs :: TEXT, $$BB$$) || $$BB$$ || COALESCE (indpred :: TEXT, $$BB$$)) AS val FROM pg_index ) subGROUP BYval HAVINGCOUNT (*) > 1ORDER BY SUM (pg_relation_size(idx)) desc) arr) and allindx.schemaname not in ('pg_toast','pg_catalog');