在上一篇《PostgreSQL源码分析——基础备份》中,我们分析了PG中基础备份的过程以及源码。备份与恢复是密不可分的,这里我们继续分析一下从基础备份中进行恢复的源码。
备份过程
执行备份:
postgres=# select pg_start_backup('bak3');
 pg_start_backup
-----------------
 0/6000060
(1 row)

postgres=# insert into t1 values(5);
INSERT 0 1
postgres=# select pg_stop_backup();
NOTICE:  WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup
 pg_stop_backup
----------------
 0/60002D8
(1 row)
查看日志:
postgres@slpc:~/pgsql/pgdata/pg_wal$ pg_waldump -p ../pg_wal 000000010000000000000006
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000028, prev 0/05000110, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000060, prev 0/06000028, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: XLOG len (rec/tot): 114/ 114, tx: 0, lsn: 0/06000098, prev 0/06000060, desc: CHECKPOINT_ONLINE redo 0/6000060; tli 1; prev tli 1; fpw true; xid 0:738; oid 16387; multi 1; offset 0; oldest xid 726 in DB 1; oldest multi 1 in DB 1; oldest/newest commit timestamp xid: 0/0; oldest running xid 738; online
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000110, prev 0/06000098, desc: RUNNING_XACTS nextXid 738 latestCompletedXid 737 oldestRunningXid 738
rmgr: Heap len (rec/tot): 54/ 258, tx: 738, lsn: 0/06000148, prev 0/06000110, desc: INSERT off 5 flags 0x00, blkref #0: rel 1663/13010/16384 blk 0 FPW
rmgr: Transaction len (rec/tot): 34/ 34, tx: 738, lsn: 0/06000250, prev 0/06000148, desc: COMMIT 2023-09-18 14:40:06.694650 CST
rmgr: Standby len (rec/tot): 50/ 50, tx: 0, lsn: 0/06000278, prev 0/06000250, desc: RUNNING_XACTS nextXid 739 latestCompletedXid 738 oldestRunningXid 739
rmgr: XLOG len (rec/tot): 34/ 34, tx: 0, lsn: 0/060002B0, prev 0/06000278, desc: BACKUP_END 0/6000060
rmgr: XLOG len (rec/tot): 24/ 24, tx: 0, lsn: 0/060002D8, prev 0/060002B0, desc: SWITCH
查看backup_label文件:
postgres@slpc:~/pgsql/pgbak2$ cat backup_label
START WAL LOCATION: 0/6000060 (file 000000010000000000000006)
CHECKPOINT LOCATION: 0/6000098
BACKUP METHOD: pg_start_backup
BACKUP FROM: primary
START TIME: 2023-09-18 14:39:50 CST
LABEL: bak3
START TIMELINE: 1
恢复源码分析
启动备份数据库,检测到有backup_label文件时,则认为是从一个备份文件中进行恢复,读取backup_label中的检查点信息,而不是从pg_control中读取。
main(int argc, char *argv[])
--> PostmasterMain(argc, argv);
    --> LocalProcessControlFile(false);     // 读pg_control文件
    --> StartupPID = StartupDataBase();     // 启动startup子进程
        --> StartupProcessMain();
            --> StartupXLOG();
                --> ValidateXLOGDirectoryStructure();   // Verify that pg_wal and pg_wal/archive_status exist.
                --> readRecoverySignalFile();           // Check for signal files, and if so set up state for offline recovery
                --> validateRecoveryParameters();
                --> XLogReaderAllocate                  // Allocate and initialize a new XLogReader.
                // 是否存在backup_label文件,如果存在的话,则认为是从一个备份文件进行恢复
                --> read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)
                --> record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);  // 回放的起点为backup_label中的检查点
                    --> XLogBeginRead(xlogreader, RecPtr);      // Begin reading WAL at 'RecPtr'.
                    --> record = ReadRecord(xlogreader, LOG, true);
                        for (;;)
                        {
                            record = XLogReadRecord(xlogreader, &errormsg);     // Attempt to read an XLOG record.
                        }
                --> StartupCLOG();
                /* REDO */
                if (InRecovery)
                {
                    UpdateControlFile();
                    CheckRecoveryConsistency();
                    if (checkPoint.redo < RecPtr)
                    {
                        /* back up to find the record */
                        XLogBeginRead(xlogreader, checkPoint.redo);
                        record = ReadRecord(xlogreader, PANIC, false);
                    }
                    else
                    {
                        /* just have to read next record after CheckPoint */
                        record = ReadRecord(xlogreader, LOG, false);
                    }
                    if (record != NULL)
                    {
                        /* main redo apply loop */
                        do  // 回放日志
                        {
                            // 判断是否已达到指定恢复位置,PITR用
                            if (recoveryStopsBefore(xlogreader))
                            {
                                reachedRecoveryTarget = true;
                                break;
                            }
                            /* Now apply the WAL record itself */
                            RmgrTable[record->xl_rmid].rm_redo(xlogreader);
                        }
                    }
                }
核心函数StartupXLOG
源码分析:
void StartupXLOG(void)
{// .../* Set up XLOG reader facility */MemSet(&private, 0, sizeof(XLogPageReadPrivate));xlogreader = XLogReaderAllocate(wal_segment_size, NULL, XL_ROUTINE(.page_read = &XLogPageRead, .segment_open = NULL, .segment_close = wal_segment_close), &private);// 读backup_label文件if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)){/* Archive recovery was requested, and thanks to the backup label* file, we know how far we need to replay to reach consistency. Enter* archive recovery directly. */InArchiveRecovery = true;/* When a backup_label file is present, we want to roll forward from* the checkpoint it identifies, rather than using pg_control. */record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);if (record != NULL){memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));/* Make sure that REDO location exists. This may not be the case* if there was a crash during an online backup, which left a* backup_label around that references a WAL segment that's already been archived. 
*/if (checkPoint.redo < checkPointLoc){XLogBeginRead(xlogreader, checkPoint.redo);if (!ReadRecord(xlogreader, LOG, false))ereport(FATAL,(errmsg("could not find redo location referenced by checkpoint record"), errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n" "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n" "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir)));}}else{ereport(FATAL,(errmsg("could not locate required checkpoint record"),errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n""If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n""Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.", DataDir, DataDir, DataDir)));wasShutdown = false; /* keep compiler quiet */}/* set flag to delete it later */haveBackupLabel = true;}else // 如果没有backup_label文件,则读pg_control文件,在备机恢复的场景中,如果丢失了backup_label文件,而读取了pg_control文件中的检查点,则会因为回放位置不对,无法达成数据一致,恢复失败。{/* Get the last valid checkpoint record. */checkPointLoc = ControlFile->checkPoint;RedoStartLSN = ControlFile->checkPointCopy.redo;record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);if (record != NULL){ereport(DEBUG1,(errmsg_internal("checkpoint record is at %X/%X", LSN_FORMAT_ARGS(checkPointLoc))));}else{/** We used to attempt to go back to a secondary checkpoint record* here, but only when not in standby mode. 
We now just fail if we* can't read the last checkpoint because this allows us to* simplify processing around checkpoints.*/ereport(PANIC,(errmsg("could not locate a valid checkpoint record")));}memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));}/* REDO */if (InRecovery){/** Set backupStartPoint if we're starting recovery from a base backup.** Also set backupEndPoint and use minRecoveryPoint as the backup end* location if we're starting recovery from a base backup which was* taken from a standby. In this case, the database system status in* pg_control must indicate that the database was already in recovery.* Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be* DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted* before reaching this point; e.g. because restore_command or primary_conninfo were faulty.** Any other state indicates that the backup somehow became corrupted and we can't sensibly continue with recovery.*/if (haveBackupLabel){ControlFile->backupStartPoint = checkPoint.redo; // 从基础备份中恢复ControlFile->backupEndRequired = backupEndRequired;if (backupFromStandby){if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)ereport(FATAL,(errmsg("backup_label contains data inconsistent with control file"),errhint("This means that the backup is corrupted and you will ""have to use another backup for recovery.")));ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;}}UpdateControlFile(); // 更新pg_control,主要是将Backup start location写入/** We're in recovery, so unlogged relations may be trashed and must be* reset. 
This should be done BEFORE allowing Hot Standby* connections, so that read-only backends don't try to read whatever* garbage is left over from before.*/ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);/* Initialize resource managers */for (rmid = 0; rmid <= RM_MAX_ID; rmid++){if (RmgrTable[rmid].rm_startup != NULL)RmgrTable[rmid].rm_startup();}CheckRecoveryConsistency(); // Checks if recovery has reached a consistent state./** Find the first record that logically follows the checkpoint --- it* might physically precede it, though. */if (checkPoint.redo < RecPtr){/* back up to find the record */XLogBeginRead(xlogreader, checkPoint.redo);record = ReadRecord(xlogreader, PANIC, false);}else{/* just have to read next record after CheckPoint */record = ReadRecord(xlogreader, LOG, false);}if (record != NULL){// 在这里进行实质的日志回放/* main redo apply loop */do{bool switchedTLI = false;// 用于PITR,判断是否已经回放到了指定的Target/* Have we reached our recovery target? */if (recoveryStopsBefore(xlogreader)){reachedRecoveryTarget = true;break;}/* Now apply the WAL record itself */RmgrTable[record->xl_rmid].rm_redo(xlogreader); // 调用standby_redo,xlog_redo,heap_redo,xact_redo等,进行回放,/* Allow read-only connections if we're consistent now */CheckRecoveryConsistency();/* Exit loop if we reached inclusive recovery target */if (recoveryStopsAfter(xlogreader)){reachedRecoveryTarget = true;break;}/* Else, try to fetch the next WAL record */record = ReadRecord(xlogreader, LOG, false); } while (record != NULL); // 直到结束}}/** Determine where to start writing WAL next.** When recovery ended in an incomplete record, write a WAL record about* that and continue after it. In all other cases, re-fetch the last* valid or last applied record, so we can identify the exact endpoint of* what we consider the valid portion of WAL.*/XLogBeginRead(xlogreader, LastRec);record = ReadRecord(xlogreader, PANIC, false);EndOfLog = EndRecPtr;// ...}
日志会一直回放到XLOG_BACKUP_END记录,该记录由xlog_redo函数处理:
/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 *
 * xlog_redo (abridged excerpt; elided parts are marked with "// ..."):
 * replays a single XLOG-rmgr record, dispatching on the record's info bits.
 * The XLOG_BACKUP_END branch is the one that ends the restore-from-backup
 * phase by clearing backupStartPoint in the control file.
 */
void xlog_redo(XLogReaderState *record)
{
    uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
    XLogRecPtr lsn = record->EndRecPtr;

    if (info == XLOG_NEXTOID)
    {
        // ...
    }
    else if (info == XLOG_CHECKPOINT_SHUTDOWN)
    {
        CheckPoint checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        // ...
        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_CHECKPOINT_ONLINE)
    {
        CheckPoint checkPoint;

        memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
        // ...
        RecoveryRestartPoint(&checkPoint);
    }
    else if (info == XLOG_OVERWRITE_CONTRECORD)
    {
        xl_overwrite_contrecord xlrec;

        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
        VerifyOverwriteContrecord(&xlrec, record);
    }
    else if (info == XLOG_END_OF_RECOVERY)
    {
        xl_end_of_recovery xlrec;

        memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

        /*
         * For Hot Standby, we could treat this like a Shutdown Checkpoint,
         * but this case is rarer and harder to test, so the benefit doesn't
         * outweigh the potential extra cost of maintenance.
         */

        /*
         * We should've already switched to the new TLI before replaying this
         * record.
         */
        if (xlrec.ThisTimeLineID != ThisTimeLineID)
            ereport(PANIC,
                    (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
                            xlrec.ThisTimeLineID, ThisTimeLineID)));
    }
    else if (info == XLOG_NOOP)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_SWITCH)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_RESTORE_POINT)
    {
        /* nothing to do here */
    }
    else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
    {
        /*
         * Full-page image (FPI) records contain nothing else but a backup
         * block (or multiple backup blocks). Every block reference must
         * include a full-page image - otherwise there would be no point in
         * this record.
         *
         * No recovery conflicts are generated by these generic records - if a
         * resource manager needs to generate conflicts, it has to define a
         * separate WAL record type and redo routine.
         *
         * XLOG_FPI_FOR_HINT records are generated when a page needs to be
         * WAL-logged because of a hint bit update. They are only generated
         * when checksums are enabled. There is no difference in handling
         * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
         * code just to distinguish them for statistics purposes.
         */
        for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
        {
            Buffer buffer;

            if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
                elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
            UnlockReleaseBuffer(buffer);
        }
    }
    else if (info == XLOG_BACKUP_END)
    {
        /* Replay reaches this record: the restore-from-backup phase ends here. */
        XLogRecPtr startpoint;

        memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

        if (ControlFile->backupStartPoint == startpoint)
        {
            /*
             * We have reached the end of base backup, the point where
             * pg_stop_backup() was done. The data on disk is now consistent.
             * Reset backupStartPoint, and update minRecoveryPoint to make
             * sure we don't allow starting up at an earlier point even if
             * recovery is stopped and restarted soon after this.
             */
            elog(DEBUG1, "end of backup reached");

            LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

            if (ControlFile->minRecoveryPoint < lsn)
            {
                ControlFile->minRecoveryPoint = lsn;
                ControlFile->minRecoveryPointTLI = ThisTimeLineID;
            }
            ControlFile->backupStartPoint = InvalidXLogRecPtr;
            ControlFile->backupEndRequired = false;
            UpdateControlFile();

            LWLockRelease(ControlFileLock);
        }
    }
    else if (info == XLOG_PARAMETER_CHANGE)
    {
        // ...
    }
    else if (info == XLOG_FPW_CHANGE)
    {
        // ...
    }
}