[x265 Encoder] Chapter 4: x265 Inter-Prediction Flow

  Series Index

Introduction to the HEVC Video Coding Standard

[x265 Encoder] Chapter 1: Analysis of the lookahead Module

[x265 Encoder] Chapter 2: Encoding Flow and an Encoder Demo Based on x265

[x265 Encoder] Chapter 3: Intra-Prediction Flow

[x265 Encoder] Chapter 4: Inter-Prediction Flow

[x265 Encoder] Chapter 5: x265 Inter Motion Estimation Flow

[x265 Encoder] Chapter 6: x265 Rate Control


Contents

  Series Index

I. Inter-Prediction Flow

II. Code Analysis of Each Module

1. Inter coding: Analysis::compressInterCU_rd0_4

2. Inter merge-mode cost calculation: checkMerge2Nx2N_rd0_4

3. Building the merge candidate list: getInterMergeCandidates

4. Motion compensation: Predict::motionCompensation

5. Skip-mode rate-distortion cost: Search::encodeResAndCalcRdSkipCU

6. Merge-mode rate-distortion cost: Search::encodeResAndCalcRdInterCU

7. Complexity check: Analysis::complexityCheckCU

8. Inter prediction: Analysis::checkInter_rd0_4

9. Inter prediction search: Search::predInterSearch

10. Motion estimation: MotionEstimate::motionEstimate


I. Inter-Prediction Flow

The overall inter-prediction flow is shown in the figure below; the yellow blocks are the inter-prediction steps, and the flowchart also covers the flow of the other x265 modules.

II. Code Analysis of Each Module

1. Inter coding: Analysis::compressInterCU_rd0_4

The flow is:

1. First compute the RD cost of the merge and skip modes; if skip does not already win, continue;

2. Recurse into the four sub-CUs (back to step 1 for each), splitting all the way down to the minimum 8x8 block;

3. Evaluate 2Nx2N, 2NxN, Nx2N, 2NxnD, 2NxnU, nRx2N, nLx2N and the intra mode inside inter slices; under the default configuration x265 does not evaluate the rectangular and asymmetric partitions;

4. Return the best mode (a condensed skeleton of this decision order is sketched below).
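To keep this structure in mind before reading the full function, here is a condensed, self-contained skeleton of the decision order. The types, helper methods and cost values are hypothetical stand-ins for illustration only; they are not the real x265 interfaces.

#include <cstdint>

// Hypothetical, greatly simplified stand-ins for the real x265 types.
struct ModeCost { uint64_t sa8dCost; bool isSkip; };

struct CuSketch
{
    int depth;
    int maxDepth;

    // Dummy cost hooks; the real encoder derives these from SA8D distortion plus lambda * bits.
    ModeCost checkMergeSkip()  const { return { 100u * (uint64_t)(depth + 1), false }; }
    ModeCost checkInter2Nx2N() const { return { 120u * (uint64_t)(depth + 1), false }; }
    ModeCost checkIntra()      const { return { 150u * (uint64_t)(depth + 1), false }; }
};

static ModeCost better(ModeCost a, ModeCost b) { return a.sa8dCost <= b.sa8dCost ? a : b; }

// Step 1: merge/skip; step 2: recurse into the four sub-CUs unless skip already won;
// step 3: motion estimation (only 2Nx2N shown; rect/AMP omitted) and intra at this depth;
// step 4: return the cheapest candidate.
ModeCost compressInterCU_rd0_4_sketch(const CuSketch& cu)
{
    ModeCost best = cu.checkMergeSkip();                 // step 1
    bool skipRecursion = best.isSkip;

    if (cu.depth < cu.maxDepth && !skipRecursion)        // step 2
    {
        ModeCost split{ 0, false };
        for (int i = 0; i < 4; i++)
            split.sa8dCost += compressInterCU_rd0_4_sketch({ cu.depth + 1, cu.maxDepth }).sa8dCost;
        best = better(best, split);
    }

    best = better(best, cu.checkInter2Nx2N());           // step 3
    best = better(best, cu.checkIntra());
    return best;                                         // step 4
}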

The code analysis is as follows:

SplitData Analysis::compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp)
{if (parentCTU.m_vbvAffected && calculateQpforCuSize(parentCTU, cuGeom, 1))return compressInterCU_rd5_6(parentCTU, cuGeom, qp);uint32_t depth = cuGeom.depth;uint32_t cuAddr = parentCTU.m_cuAddr;ModeDepth& md = m_modeDepth[depth];if (m_param->searchMethod == X265_SEA){   //根据 Inter CU 的预测方向(单向或双向),以及参考帧索引数目,将 m_modeDepth[depth].fencYuv.m_integral 数组填充为对应的参考帧的积分图像数据int numPredDir = m_slice->isInterP() ? 1 : 2;int offset = (int)(m_frame->m_reconPic->m_cuOffsetY[parentCTU.m_cuAddr] + m_frame->m_reconPic->m_buOffsetY[cuGeom.absPartIdx]);for (int list = 0; list < numPredDir; list++)for (int i = 0; i < m_frame->m_encData->m_slice->m_numRefIdx[list]; i++)for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)m_modeDepth[depth].fencYuv.m_integral[list][i][planes] = m_frame->m_encData->m_slice->m_refFrameList[list][i]->m_encData->m_meIntegral[planes] + offset;}//用于存储重构图像数据PicYuv& reconPic = *m_frame->m_reconPic;SplitData splitCUData;bool bHEVCBlockAnalysis = (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions > 16);bool bRefineAVCAnalysis = (m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]));bool bNooffloading = !(m_param->bAnalysisType == AVC_INFO);if (bHEVCBlockAnalysis || bRefineAVCAnalysis || bNooffloading){   //将 md.bestMode 设置为 NULL,用于存储最佳的模式md.bestMode = NULL;bool mightSplit = !(cuGeom.flags & CUGeom::LEAF);bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY);uint32_t minDepth = topSkipMinDepth(parentCTU, cuGeom);//使用 topSkipMinDepth 函数计算 minDepth,表示顶部跳过的最小深度bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth;bool skipModes = false; //用于控制是否跳过模式分析/* Skip any remaining mode analyses at current depth */bool skipRecursion = false; //用于控制是否跳过递归/* Skip recursion */bool splitIntra = true;bool skipRectAmp = false;bool chooseMerge = false;bool bCtuInfoCheck = false;int sameContentRef = 0;if (m_evaluateInter){if (m_refineLevel == 2){if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP)skipModes = true;if (parentCTU.m_partSize[cuGeom.absPartIdx] == SIZE_2Nx2N)skipRectAmp = true;}mightSplit &= false;minDepth = depth;}if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4)m_maxTUDepth = loadTUDepth(cuGeom, parentCTU);//创建了一个名为 splitData 的长度为 4 的 SplitData 数组,并通过调用 initSplitCUData 函数初始化每个元素SplitData splitData[4];splitData[0].initSplitCUData();splitData[1].initSplitCUData();splitData[2].initSplitCUData();splitData[3].initSplitCUData();// avoid uninitialize value in below referenceif (m_param->limitModes){md.pred[PRED_2Nx2N].bestME[0][0].mvCost = 0; // L0md.pred[PRED_2Nx2N].bestME[0][1].mvCost = 0; // L1md.pred[PRED_2Nx2N].sa8dCost = 0;}if (m_param->bCTUInfo && depth <= parentCTU.m_cuDepth[cuGeom.absPartIdx]){if (bDecidedDepth && m_additionalCtuInfo[cuGeom.absPartIdx])sameContentRef = findSameContentRefCount(parentCTU, cuGeom);if (depth < parentCTU.m_cuDepth[cuGeom.absPartIdx]){mightNotSplit &= bDecidedDepth;bCtuInfoCheck = skipRecursion = false;skipModes = true;}else if (mightNotSplit && bDecidedDepth){if (m_additionalCtuInfo[cuGeom.absPartIdx]){bCtuInfoCheck = skipRecursion = true;md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);if (!sameContentRef){if ((m_param->bCTUInfo & 2) && (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth)){qp -= int32_t(0.04 * qp);setLambdaFromQP(parentCTU, qp);}if (m_param->bCTUInfo & 4)skipModes = false;}if (sameContentRef || (!sameContentRef && 
!(m_param->bCTUInfo & 4))){if (m_param->rdLevel)skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);if ((m_param->bCTUInfo & 4) && sameContentRef)skipModes = md.bestMode && true;}}else{   //进行初始化md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);//对 md.pred[PRED_SKIP] 和 md.pred[PRED_MERGE] 进行 2Nx2N merge模式的分析if (m_param->rdLevel)skipModes = m_param->bEnableEarlySkip && md.bestMode && md.bestMode->cu.isSkipped(0);}mightSplit &= !bDecidedDepth;}}if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)){if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]){if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP){md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);skipRecursion = !!m_param->recursionSkipMode && md.bestMode;if (m_param->rdLevel)skipModes = m_param->bEnableEarlySkip && md.bestMode;}if (m_param->analysisLoadReuseLevel > 4 && m_reusePartSize[cuGeom.absPartIdx] == SIZE_2Nx2N){if (m_reuseModes[cuGeom.absPartIdx] != MODE_INTRA  && m_reuseModes[cuGeom.absPartIdx] != 4){skipRectAmp = true && !!md.bestMode;chooseMerge = !!m_reuseMergeFlag[cuGeom.absPartIdx] && !!md.bestMode;}}}}if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU){if (mightNotSplit && depth == m_reuseDepth[cuGeom.absPartIdx]){if (m_reuseModes[cuGeom.absPartIdx] == MODE_SKIP){md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);skipRecursion = !!m_param->recursionSkipMode && md.bestMode;if (m_param->rdLevel)skipModes = m_param->bEnableEarlySkip && md.bestMode;}}}//如果上述没有设置skip模式,则评估可能提前退出的合并模式/* Step 1. 
Evaluate Merge/Skip candidates for likely early-outs, if skip mode was not set above */if ((mightNotSplit && depth >= minDepth && !md.bestMode && !bCtuInfoCheck) || (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1])))/* TODO: Re-evaluate if analysis load/save still works */{/* Compute Merge Cost */md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp);md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp);checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom);if (m_param->rdLevel)skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2)&& md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth}//检查是否满足跳过递归的条件。这些条件包括:存在最佳模式(bestMode)、启用递归跳过模式(recursionSkipMode)、没有进行CTU信息检查(bCtuInfoCheck)以及不满足特定的分析类型条件if (md.bestMode && m_param->recursionSkipMode && !bCtuInfoCheck && !(m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))){skipRecursion = md.bestMode->cu.isSkipped(0);if (mightSplit && !skipRecursion)//如果满足可能分割的条件(mightSplit)且不应跳过递归(!skipRecursion),则根据不同的情况进行判断{   //如果深度达到最小深度(minDepth)且递归跳过模式为基于RDCOST的跳过(RDCOST_BASED_RSKIP),则根据递归深度检查(recursionDepthCheck)和复杂度检查(complexityCheckCU)来确定是否应该跳过递归if (depth >= minDepth && m_param->recursionSkipMode == RDCOST_BASED_RSKIP){if (depth)skipRecursion = recursionDepthCheck(parentCTU, cuGeom, *md.bestMode);if (m_bHD && !skipRecursion && m_param->rdLevel == 2 && md.fencYuv.m_size != MAX_CU_SIZE)skipRecursion = complexityCheckCU(*md.bestMode);}//如果CU的CU尺寸(log2CUSize)大于等于最大CU尺寸减1(MAX_LOG2_CU_SIZE - 1)且递归跳过模式为基于边缘的跳过(EDGE_BASED_RSKIP),则根据复杂度检查来确定是否应该跳过递归else if (cuGeom.log2CUSize >= MAX_LOG2_CU_SIZE - 1 && m_param->recursionSkipMode == EDGE_BASED_RSKIP){skipRecursion = complexityCheckCU(*md.bestMode);}}}//如果分析类型为AVC_INFO、存在最佳模式(bestMode)、CU分区数小于等于16,并且分析加载重用级别为7,这些条件都满足,将设置跳过递归(skipRecursion)为trueif (m_param->bAnalysisType == AVC_INFO && md.bestMode && cuGeom.numPartitions <= 16 && m_param->analysisLoadReuseLevel == 7)skipRecursion = true;/* Step 2. 
Evaluate each of the 4 split sub-blocks in series */if (mightSplit && !skipRecursion){   //初始化分割预测模式(splitPred)和分割CU数据(splitCU)if (bCtuInfoCheck && m_param->bCTUInfo & 2)qp = int((1 / 0.96) * qp + 0.5);Mode* splitPred = &md.pred[PRED_SPLIT];splitPred->initCosts();CUData* splitCU = &splitPred->cu;splitCU->initSubCU(parentCTU, cuGeom, qp);//初始化下一个深度(nextDepth)和对应的模式深度(nd)uint32_t nextDepth = depth + 1;ModeDepth& nd = m_modeDepth[nextDepth];invalidateContexts(nextDepth);Entropy* nextContext = &m_rqt[depth].cur;int nextQP = qp;//根据情况更新下一个QP(nextQP)splitIntra = false;//进行子块的压缩编码(compressInterCU_rd0_4),并保存相应的数据for (uint32_t subPartIdx = 0; subPartIdx < 4; subPartIdx++){const CUGeom& childGeom = *(&cuGeom + cuGeom.childOffset + subPartIdx);if (childGeom.flags & CUGeom::PRESENT){m_modeDepth[0].fencYuv.copyPartToYuv(nd.fencYuv, childGeom.absPartIdx);m_rqt[nextDepth].cur.load(*nextContext);if (m_slice->m_pps->bUseDQP && nextDepth <= m_slice->m_pps->maxCuDQPDepth)nextQP = setLambdaFromQP(parentCTU, calculateQpforCuSize(parentCTU, childGeom));splitData[subPartIdx] = compressInterCU_rd0_4(parentCTU, childGeom, nextQP);//判断子块最佳模式是否为内部预测(intra),并更新splitIntra标志,将最佳模式的数据复制到相应的子块CU和预测模式(splitCU和splitPred)中// Save best CU and pred data for this sub CUsplitIntra |= nd.bestMode->cu.isIntra(0);splitCU->copyPartFrom(nd.bestMode->cu, childGeom, subPartIdx);splitPred->addSubCosts(*nd.bestMode);//根据编码参数(m_param)和RD级别(rdLevel)更新上下文(nextContext)if (m_param->rdLevel)nd.bestMode->reconYuv.copyToPartYuv(splitPred->reconYuv, childGeom.numPartitions * subPartIdx);elsend.bestMode->predYuv.copyToPartYuv(splitPred->predYuv, childGeom.numPartitions * subPartIdx);if (m_param->rdLevel > 1)nextContext = &nd.bestMode->contexts;}else//如果子块不存在(flags & CUGeom::PRESENT),则设置子块为空splitCU->setEmptyPart(childGeom, subPartIdx);}//循环结束后,存储最终的上下文(nextContext)到分割预测模式(splitPred)中nextContext->store(splitPred->contexts);//后,根据可能不进行分割(mightNotSplit)和RD级别(rdLevel)的情况,分别计算分割标志的成本(addSplitFlagCost)或更新模式成本(updateModeCost)if (mightNotSplit)addSplitFlagCost(*splitPred, cuGeom.depth);else if (m_param->rdLevel > 1)updateModeCost(*splitPred);elsesplitPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)splitPred->distortion, splitPred->sa8dBits);}/* If analysis mode is simple do not Evaluate other modes */if (m_param->bAnalysisType == AVC_INFO && m_param->analysisLoadReuseLevel == 7){if (m_slice->m_sliceType == P_SLICE){if (m_checkMergeAndSkipOnly[0])skipModes = true;}else{if (m_checkMergeAndSkipOnly[0] && m_checkMergeAndSkipOnly[1])skipModes = true;}}/* Split CUs*   0  1*   2  3 *///定义了一个名为allSplitRefs的变量,它存储了四个子块的分割参考值(splitRefs)。这些参考值是由之前的代码段计算得到的uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;/* Step 3. 
Evaluate ME (2Nx2N, rect, amp) and intra modes at current depth */if (mightNotSplit && (depth >= minDepth || (m_param->bCTUInfo && !md.bestMode))){   //如果使用了DQP(bUseDQP)并且当前深度小于等于最大CU DQP深度(maxCuDQPDepth),同时最大CU DQP深度不为0,则根据父CTU和QP设置Lambdaif (m_slice->m_pps->bUseDQP && depth <= m_slice->m_pps->maxCuDQPDepth && m_slice->m_pps->maxCuDQPDepth != 0)setLambdaFromQP(parentCTU, qp);if (!skipModes){   //定义了一个名为refMasks的数组,用于存储分割参考值,使用checkInter_rd0_4函数对2Nx2N模式进行评估,计算运动矢量成本和其他相关数据uint32_t refMasks[2];refMasks[0] = allSplitRefs;//初始化子块CUmd.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_2Nx2N], cuGeom, SIZE_2Nx2N, refMasks);//如果限制参考图像数(limitReferences)的条件为真,将最佳参考索引(refMask)赋值给所有子块的分割参考值if (m_param->limitReferences & X265_REF_LIMIT_CU){CUData& cu = md.pred[PRED_2Nx2N].cu;uint32_t refMask = cu.getBestRefIdx(0);allSplitRefs = splitData[0].splitRefs = splitData[1].splitRefs = splitData[2].splitRefs = splitData[3].splitRefs = refMask;}//如果切片类型为B_SLICEif (m_slice->m_sliceType == B_SLICE){   //初始化双向预测模式的子块CU(md.pred[PRED_BIDIR].cu),对双向预测模式进行评估,计算运动矢量成本和其他相关数据md.pred[PRED_BIDIR].cu.initSubCU(parentCTU, cuGeom, qp);checkBidir2Nx2N(md.pred[PRED_2Nx2N], md.pred[PRED_BIDIR], cuGeom);}//将bestInter指针指向md.pred[PRED_2Nx2N]Mode *bestInter = &md.pred[PRED_2Nx2N];if (!skipRectAmp){if (m_param->bEnableRectInter){   //计算分割成本(splitCost)uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;uint32_t threshold_2NxN, threshold_Nx2N;//根据切片类型设置2NxN和Nx2N的阈值if (m_slice->m_sliceType == P_SLICE){threshold_2NxN = splitData[0].mvCost[0] + splitData[1].mvCost[0];threshold_Nx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];}else{threshold_2NxN = (splitData[0].mvCost[0] + splitData[1].mvCost[0]+ splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;threshold_Nx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]+ splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;}//如果2NxN优先且splitCost小于md.pred[PRED_2Nx2N]的成本加上threshold_2NxNint try_2NxN_first = threshold_2NxN < threshold_Nx2N;if (try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN){   //更新参考掩码refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);//初始化子块CUcheckInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_2NxN];//如果md.pred[PRED_2NxN]的sa8dCost小于bestInter的sa8dCost,则将bestInter指针指向md.pred[PRED_2NxN]}//同上,Nx2N模式if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_Nx2N){refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* left */refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* right */md.pred[PRED_Nx2N].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_Nx2N], cuGeom, SIZE_Nx2N, refMasks);if (md.pred[PRED_Nx2N].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_Nx2N];}//同上if (!try_2NxN_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxN){refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* top */refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* bot */md.pred[PRED_2NxN].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_2NxN], cuGeom, SIZE_2NxN, refMasks);if (md.pred[PRED_2NxN].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_2NxN];}}//于判断当前深度是否可以进行CU分割if (m_slice->m_sps->maxAMPDepth > depth){   
//计算四个子块的SA8D成本uint64_t splitCost = splitData[0].sa8dCost + splitData[1].sa8dCost + splitData[2].sa8dCost + splitData[3].sa8dCost;uint32_t threshold_2NxnU, threshold_2NxnD, threshold_nLx2N, threshold_nRx2N;//根据切片类型(m_slice->m_sliceType)设置不同的阈值(threshold)if (m_slice->m_sliceType == P_SLICE){threshold_2NxnU = splitData[0].mvCost[0] + splitData[1].mvCost[0];threshold_2NxnD = splitData[2].mvCost[0] + splitData[3].mvCost[0];threshold_nLx2N = splitData[0].mvCost[0] + splitData[2].mvCost[0];threshold_nRx2N = splitData[1].mvCost[0] + splitData[3].mvCost[0];}else{threshold_2NxnU = (splitData[0].mvCost[0] + splitData[1].mvCost[0]+ splitData[0].mvCost[1] + splitData[1].mvCost[1] + 1) >> 1;threshold_2NxnD = (splitData[2].mvCost[0] + splitData[3].mvCost[0]+ splitData[2].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;threshold_nLx2N = (splitData[0].mvCost[0] + splitData[2].mvCost[0]+ splitData[0].mvCost[1] + splitData[2].mvCost[1] + 1) >> 1;threshold_nRx2N = (splitData[1].mvCost[0] + splitData[3].mvCost[0]+ splitData[1].mvCost[1] + splitData[3].mvCost[1] + 1) >> 1;}//据最佳帧间预测模式(bestInter)的CU分割大小(bestInter->cu.m_partSize[0])判断是否进行水平和垂直方向的分割bool bHor = false, bVer = false;if (bestInter->cu.m_partSize[0] == SIZE_2NxN)bHor = true;//如果CU分割大小是SIZE_2NxN,则进行水平方向的分割(bHor设置为true)else if (bestInter->cu.m_partSize[0] == SIZE_Nx2N)bVer = true;//如果CU分割大小是SIZE_Nx2N,则进行垂直方向的分割(bVer设置为true)else if (bestInter->cu.m_partSize[0] == SIZE_2Nx2N &&md.bestMode && md.bestMode->cu.getQtRootCbf(0)){bHor = true;bVer = true;}if (bHor){int try_2NxnD_first = threshold_2NxnD < threshold_2NxnU;if (try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD){refMasks[0] = allSplitRefs;                                    /* 75% top */refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_2NxnD];}if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnU){refMasks[0] = splitData[0].splitRefs | splitData[1].splitRefs; /* 25% top */refMasks[1] = allSplitRefs;                                    /* 75% bot */md.pred[PRED_2NxnU].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_2NxnU], cuGeom, SIZE_2NxnU, refMasks);if (md.pred[PRED_2NxnU].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_2NxnU];}if (!try_2NxnD_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_2NxnD){refMasks[0] = allSplitRefs;                                    /* 75% top */refMasks[1] = splitData[2].splitRefs | splitData[3].splitRefs; /* 25% bot */md.pred[PRED_2NxnD].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_2NxnD], cuGeom, SIZE_2NxnD, refMasks);if (md.pred[PRED_2NxnD].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_2NxnD];}}if (bVer){int try_nRx2N_first = threshold_nRx2N < threshold_nLx2N;if (try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N){refMasks[0] = allSplitRefs;                                    /* 75% left  */refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_nRx2N];}if (splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nLx2N){refMasks[0] = splitData[0].splitRefs | splitData[2].splitRefs; /* 25% left 
 */refMasks[1] = allSplitRefs;                                    /* 75% right */md.pred[PRED_nLx2N].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_nLx2N], cuGeom, SIZE_nLx2N, refMasks);if (md.pred[PRED_nLx2N].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_nLx2N];}if (!try_nRx2N_first && splitCost < md.pred[PRED_2Nx2N].sa8dCost + threshold_nRx2N){refMasks[0] = allSplitRefs;                                    /* 75% left  */refMasks[1] = splitData[1].splitRefs | splitData[3].splitRefs; /* 25% right */md.pred[PRED_nRx2N].cu.initSubCU(parentCTU, cuGeom, qp);checkInter_rd0_4(md.pred[PRED_nRx2N], cuGeom, SIZE_nRx2N, refMasks);if (md.pred[PRED_nRx2N].sa8dCost < bestInter->sa8dCost)bestInter = &md.pred[PRED_nRx2N];}}}}//该条件表示是否尝试进行帧内预测,不为B帧,不为64x64bool bTryIntra = (m_slice->m_sliceType != B_SLICE || m_param->bIntraInBFrames) && cuGeom.log2CUSize != MAX_LOG2_CU_SIZE && !((m_param->bCTUInfo & 4) && bCtuInfoCheck);if (m_param->rdLevel >= 3){    //判断是否需要进行帧间预测的亮度分量/* Calculate RD cost of best inter option */if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* When m_bChromaSa8d is enabled, chroma MC has already been done */{uint32_t numPU = bestInter->cu.getNumPartInter(0);for (uint32_t puIdx = 0; puIdx < numPU; puIdx++){   //则对bestInter中的每个PU(预测单元)进行运动补偿PredictionUnit pu(bestInter->cu, cuGeom, puIdx);motionCompensation(bestInter->cu, pu, bestInter->predYuv, false, true);}}//代码判断是否选择合并模式(chooseMerge)if (!chooseMerge){   //计算bestInter的残差并计算RD成本encodeResAndCalcRdInterCU(*bestInter, cuGeom);checkBestMode(*bestInter, depth);//然后调用checkBestMode函数更新最佳模式,如果选择合并模式,则跳过这部分处理//代码判断是否存在双向预测模式(BIDIR)并且双向预测的SA8D成本在最佳帧间预测的17/16以内/* If BIDIR is available and within 17/16 of best inter option, choose by RDO */if (m_slice->m_sliceType == B_SLICE && md.pred[PRED_BIDIR].sa8dCost != MAX_INT64 &&md.pred[PRED_BIDIR].sa8dCost * 16 <= bestInter->sa8dCost * 17){   //每个PU进行运动补偿,并计算残差和RD成本。然后调用checkBestMode函数更新最佳模式uint32_t numPU = md.pred[PRED_BIDIR].cu.getNumPartInter(0);if (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)for (uint32_t puIdx = 0; puIdx < numPU; puIdx++){PredictionUnit pu(md.pred[PRED_BIDIR].cu, cuGeom, puIdx);motionCompensation(md.pred[PRED_BIDIR].cu, pu, md.pred[PRED_BIDIR].predYuv, true, true);}encodeResAndCalcRdInterCU(md.pred[PRED_BIDIR], cuGeom);checkBestMode(md.pred[PRED_BIDIR], depth);}}//代码判断是否尝试进行帧内预测(bTryIntra为true)并且最佳帧间预测的SA8D成本为最大值(表示没有可行的帧间预测)if ((bTryIntra && md.bestMode->cu.getQtRootCbf(0)) ||md.bestMode->sa8dCost == MAX_INT64){   //如果不限制参考帧数或者进行了CU分割(splitIntra),则调用initSubCU函数初始化md.pred[PRED_INTRA]的子CU,并进行帧内帧间混合预测的处理(checkIntraInInter、encodeIntraInInter),最后调用checkBestMode函数更新最佳模式(md.pred[PRED_INTRA]if (!m_param->limitReferences || splitIntra){ProfileCounter(parentCTU, totalIntraCU[cuGeom.depth]);md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);checkIntraInInter(md.pred[PRED_INTRA], cuGeom);encodeIntraInInter(md.pred[PRED_INTRA], cuGeom);checkBestMode(md.pred[PRED_INTRA], depth);}else{ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);}}}else{/* SA8D choice between merge/skip, inter, bidir, and intra */if (!md.bestMode || bestInter->sa8dCost < md.bestMode->sa8dCost)md.bestMode = bestInter;if (m_slice->m_sliceType == B_SLICE &&md.pred[PRED_BIDIR].sa8dCost < md.bestMode->sa8dCost)md.bestMode = &md.pred[PRED_BIDIR];if (bTryIntra || md.bestMode->sa8dCost == MAX_INT64){if (!m_param->limitReferences || splitIntra){ProfileCounter(parentCTU, 
totalIntraCU[cuGeom.depth]);md.pred[PRED_INTRA].cu.initSubCU(parentCTU, cuGeom, qp);checkIntraInInter(md.pred[PRED_INTRA], cuGeom);if (md.pred[PRED_INTRA].sa8dCost < md.bestMode->sa8dCost)md.bestMode = &md.pred[PRED_INTRA];}else{ProfileCounter(parentCTU, skippedIntraCU[cuGeom.depth]);}}/* finally code the best mode selected by SA8D costs:* RD level 2 - fully encode the best mode* RD level 1 - generate recon pixels* RD level 0 - generate chroma prediction */if (md.bestMode->cu.m_mergeFlag[0] && md.bestMode->cu.m_partSize[0] == SIZE_2Nx2N){/* prediction already generated for this CU, and if rd level* is not 0, it is already fully encoded */}else if (md.bestMode->cu.isInter(0)){uint32_t numPU = md.bestMode->cu.getNumPartInter(0);if (m_csp != X265_CSP_I400){for (uint32_t puIdx = 0; puIdx < numPU; puIdx++){PredictionUnit pu(md.bestMode->cu, cuGeom, puIdx);motionCompensation(md.bestMode->cu, pu, md.bestMode->predYuv, false, true);}}if (m_param->rdLevel == 2)encodeResAndCalcRdInterCU(*md.bestMode, cuGeom);else if (m_param->rdLevel == 1){/* generate recon pixels with no rate distortion considerations */CUData& cu = md.bestMode->cu;uint32_t tuDepthRange[2];cu.getInterTUQtDepthRange(tuDepthRange, 0);m_rqt[cuGeom.depth].tmpResiYuv.subtract(*md.bestMode->fencYuv, md.bestMode->predYuv, cuGeom.log2CUSize, m_frame->m_fencPic->m_picCsp);residualTransformQuantInter(*md.bestMode, cuGeom, 0, 0, tuDepthRange);if (cu.getQtRootCbf(0))md.bestMode->reconYuv.addClip(md.bestMode->predYuv, m_rqt[cuGeom.depth].tmpResiYuv, cu.m_log2CUSize[0], m_frame->m_fencPic->m_picCsp);else{md.bestMode->reconYuv.copyFromYuv(md.bestMode->predYuv);if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N)cu.setPredModeSubParts(MODE_SKIP);}}}else{if (m_param->rdLevel == 2)encodeIntraInInter(*md.bestMode, cuGeom);else if (m_param->rdLevel == 1){/* generate recon pixels with no rate distortion considerations */CUData& cu = md.bestMode->cu;uint32_t tuDepthRange[2];cu.getIntraTUQtDepthRange(tuDepthRange, 0);residualTransformQuantIntra(*md.bestMode, cuGeom, 0, 0, tuDepthRange);if (m_csp != X265_CSP_I400){getBestIntraModeChroma(*md.bestMode, cuGeom);residualQTIntraChroma(*md.bestMode, cuGeom, 0, 0);}md.bestMode->reconYuv.copyFromPicYuv(reconPic, cu.m_cuAddr, cuGeom.absPartIdx); // TODO:}}}} // !earlyskipif (m_bTryLossless)tryLossless(cuGeom);if (mightSplit)addSplitFlagCost(*md.bestMode, cuGeom.depth);}//表示可能进行CU分割,并且没有跳过递归if (mightSplit && !skipRecursion){Mode* splitPred = &md.pred[PRED_SPLIT];if (!md.bestMode)md.bestMode = splitPred;else if (m_param->rdLevel > 1)//比较分割模式和最优模式checkBestMode(*splitPred, cuGeom.depth);else if (splitPred->sa8dCost < md.bestMode->sa8dCost)md.bestMode = splitPred;checkDQPForSplitPred(*md.bestMode, cuGeom);}//初始化splitCUData/* determine which motion references the parent CU should search */splitCUData.initSplitCUData();if (m_param->limitReferences & X265_REF_LIMIT_DEPTH){   //表示分割模式是最佳模式,将splitRefs设置为allSplitRefsif (md.bestMode == &md.pred[PRED_SPLIT])splitCUData.splitRefs = allSplitRefs;else{   //根据最佳合并/帧间模式确定参考帧,如果是帧内预测,使用2Nx2N帧间参考帧,遍历每个PU,将其最佳参考帧索引添加到splitCUData.splitRefs中/* use best merge/inter mode, in case of intra use 2Nx2N inter references */CUData& cu = md.bestMode->cu.isIntra(0) ? 
md.pred[PRED_2Nx2N].cu : md.bestMode->cu;uint32_t numPU = cu.getNumPartInter(0);for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);}}if (m_param->limitModes){splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;}//如果满足条件mightNotSplit并且最佳模式的CU被跳过(被标记为跳过),则对当前CTU的编码统计信息进行更新if (mightNotSplit && md.bestMode->cu.isSkipped(0)){FrameData& curEncData = *m_frame->m_encData;FrameData::RCStatCU& cuStat = curEncData.m_cuStat[parentCTU.m_cuAddr];uint64_t temp = cuStat.avgCost[depth] * cuStat.count[depth];cuStat.count[depth] += 1;cuStat.avgCost[depth] = (temp + md.bestMode->rdCost) / cuStat.count[depth];}//将最佳模式的CU数据和重建图像数据拷贝到encData和reconPic中/* Copy best data to encData CTU and recon */md.bestMode->cu.copyToPic(depth);if (m_param->rdLevel)md.bestMode->reconYuv.copyToPicYuv(reconPic, cuAddr, cuGeom.absPartIdx);if ((m_limitTU & X265_TU_LIMIT_NEIGH) && cuGeom.log2CUSize >= 4){if (mightNotSplit){   //如果mightNotSplit为true,获取最佳模式的CU的encData中的CTU数据,并对其进行更新,记录最大的TU深度CUData* ctu = md.bestMode->cu.m_encData->getPicCTU(parentCTU.m_cuAddr);int8_t maxTUDepth = -1;for (uint32_t i = 0; i < cuGeom.numPartitions; i++)maxTUDepth = X265_MAX(maxTUDepth, md.bestMode->cu.m_tuDepth[i]);ctu->m_refTuDepth[cuGeom.geomRecurId] = maxTUDepth;}}}else{if (m_param->bAnalysisType == AVC_INFO && cuGeom.numPartitions <= 16){qprdRefine(parentCTU, cuGeom, qp, qp);SplitData splitData[4];splitData[0].initSplitCUData();splitData[1].initSplitCUData();splitData[2].initSplitCUData();splitData[3].initSplitCUData();uint32_t allSplitRefs = splitData[0].splitRefs | splitData[1].splitRefs | splitData[2].splitRefs | splitData[3].splitRefs;splitCUData.initSplitCUData();if (m_param->limitReferences & X265_REF_LIMIT_DEPTH){if (md.bestMode == &md.pred[PRED_SPLIT])splitCUData.splitRefs = allSplitRefs;else{/* use best merge/inter mode, in case of intra use 2Nx2N inter references */CUData& cu = md.bestMode->cu.isIntra(0) ? md.pred[PRED_2Nx2N].cu : md.bestMode->cu;uint32_t numPU = cu.getNumPartInter(0);for (uint32_t puIdx = 0, subPartIdx = 0; puIdx < numPU; puIdx++, subPartIdx += cu.getPUOffset(puIdx, 0))splitCUData.splitRefs |= cu.getBestRefIdx(subPartIdx);}}if (m_param->limitModes){splitCUData.mvCost[0] = md.pred[PRED_2Nx2N].bestME[0][0].mvCost; // L0splitCUData.mvCost[1] = md.pred[PRED_2Nx2N].bestME[0][1].mvCost; // L1splitCUData.sa8dCost = md.pred[PRED_2Nx2N].sa8dCost;}}}return splitCUData;
}
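Throughout compressInterCU_rd0_4, candidate modes are compared through their sa8dCost, which m_rdCost.calcRdSADCost(distortion, sa8dBits) computes. Conceptually this is the Lagrangian cost J = D + lambda * R, with SA8D as the distortion and an estimate of the signalling bits as the rate. The following one-liner illustrates the idea with lambda assumed to be stored in 8-bit fixed point; the exact rounding and overflow checks inside x265 may differ.

#include <cstdint>

// Illustrative only: sa8dCost is, in essence, distortion + lambda * bits.
// lambdaFp is assumed to be lambda scaled by 256 (8-bit fixed point).
static uint64_t sa8dCostSketch(uint32_t sa8dDistortion, uint32_t bits, uint32_t lambdaFp)
{
    return (uint64_t)sa8dDistortion + (((uint64_t)bits * lambdaFp + 128) >> 8);
}

This is also why the rectangular and asymmetric partitions are only tried when the accumulated sub-CU split cost stays below the 2Nx2N cost plus an MV-cost-based threshold: every comparison above happens in this same sa8d-cost domain.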

2. Inter merge-mode cost calculation: checkMerge2Nx2N_rd0_4

The flow is:

1. Select 3 merge candidates (with x265's default parameters; the standard allows up to 5);

2. For each candidate, perform motion compensation (luma only), compute an RD cost based on sa8d, and keep the best candidate;

3. Perform the chroma motion compensation only for that best candidate;

4. Compute the RD cost of skip mode, which involves computing the residual, entropy coding, counting the bits and producing the final RD value;

5. In the same way, compute the RD cost of merge mode;

6. Compare skip mode against merge mode and return the better of the two (a condensed sketch of the candidate-selection loop is given after the code). The code analysis is as follows:

//如果找到有效的合并候选项,则设置md.bestMode,否则为NULL
/* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */
void Analysis::checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom)
{   //获取 cuGeom 的深度uint32_t depth = cuGeom.depth;ModeDepth& md = m_modeDepth[depth];Yuv *fencYuv = &md.fencYuv;//初始化两个 Mode 实例,命名为 tempPred 和 bestPred,并将它们分别指向 merge 和 skip/* Note that these two Mode instances are named MERGE and SKIP but they may* hold the reverse when the function returns. We toggle between the two modes */Mode* tempPred = &merge;Mode* bestPred = &skip;X265_CHECK(m_slice->m_sliceType != I_SLICE, "Evaluating merge in I slice\n");tempPred->initCosts();tempPred->cu.setPartSizeSubParts(SIZE_2Nx2N);tempPred->cu.setPredModeSubParts(MODE_INTER);tempPred->cu.m_mergeFlag[0] = true;bestPred->initCosts();bestPred->cu.setPartSizeSubParts(SIZE_2Nx2N);bestPred->cu.setPredModeSubParts(MODE_INTER);bestPred->cu.m_mergeFlag[0] = true;//用于存储merge候选项的运动矢量MVField candMvField[MRG_MAX_NUM_CANDS][2]; // double length for mv of both listsuint8_t candDir[MRG_MAX_NUM_CANDS];//用于存储合并候选项的方向uint32_t numMergeCand = tempPred->cu.getInterMergeCandidates(0, 0, candMvField, candDir);PredictionUnit pu(merge.cu, cuGeom, 0);bestPred->sa8dCost = MAX_INT64;int bestSadCand = -1;int sizeIdx = cuGeom.log2CUSize - 2;int safeX, maxSafeMv;if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE){safeX = m_slice->m_refFrameList[0][0]->m_encData->m_pir.pirEndCol * m_param->maxCUSize - 3;maxSafeMv = (safeX - tempPred->cu.m_cuPelX) * 4;}for (uint32_t i = 0; i < numMergeCand; ++i){   //启用了并行处理(m_bFrameParallel),则进行并行切片的边界检查if (m_bFrameParallel){// Parallel slices bound checkif (m_param->maxSlices > 1){// NOTE: First row in slice can't negativeif (X265_MIN(candMvField[i][0].mv.y, candMvField[i][1].mv.y) < m_sliceMinY)continue;// Last row in slice can't reference beyond bound since it is another slice area// TODO: we may beyond bound in future since these area have a chance to finish because we use parallel slices. 
Necessary prepare research on load balanceif (X265_MAX(candMvField[i][0].mv.y, candMvField[i][1].mv.y) > m_sliceMaxY)continue;}if (candMvField[i][0].mv.y >= (m_param->searchRange + 1) * 4 ||candMvField[i][1].mv.y >= (m_param->searchRange + 1) * 4)continue;}if (m_param->bIntraRefresh && m_slice->m_sliceType == P_SLICE &&tempPred->cu.m_cuPelX / m_param->maxCUSize < m_frame->m_encData->m_pir.pirEndCol &&candMvField[i][0].mv.x > maxSafeMv)// skip merge candidates which reference beyond safe reference areacontinue;//将当前候选项的相关信息赋值给 tempPred->cu 结构体的相应成员变量tempPred->cu.m_mvpIdx[0][0] = (uint8_t)i; // merge candidate ID is stored in L0 MVP idxX265_CHECK(m_slice->m_sliceType == B_SLICE || !(candDir[i] & 0x10), " invalid merge for P slice\n");tempPred->cu.m_interDir[0] = candDir[i];tempPred->cu.m_mv[0][0] = candMvField[i][0].mv;tempPred->cu.m_mv[1][0] = candMvField[i][1].mv;tempPred->cu.m_refIdx[0][0] = (int8_t)candMvField[i][0].refIdx;tempPred->cu.m_refIdx[1][0] = (int8_t)candMvField[i][1].refIdx;//执行运动补偿,根据候选项的运动矢量对 tempPred->predYuv 进行预测motionCompensation(tempPred->cu, pu, tempPred->predYuv, true, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400));//当前候选项的比特数,当前候选项的失真tempPred->sa8dBits = getTUBits(i, numMergeCand);tempPred->distortion = primitives.cu[sizeIdx].sa8d(fencYuv->m_buf[0], fencYuv->m_size, tempPred->predYuv.m_buf[0], tempPred->predYuv.m_size);if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)){   //计算色度平面的 SA8D 失真tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[1], fencYuv->m_csize, tempPred->predYuv.m_buf[1], tempPred->predYuv.m_csize);tempPred->distortion += primitives.chroma[m_csp].cu[sizeIdx].sa8d(fencYuv->m_buf[2], fencYuv->m_csize, tempPred->predYuv.m_buf[2], tempPred->predYuv.m_csize);}//当前候选项的 RD 代价,将失真值和比特数作为参数tempPred->sa8dCost = m_rdCost.calcRdSADCost((uint32_t)tempPred->distortion, tempPred->sa8dBits);//如果 tempPred->sa8dCost 小于 bestPred->sa8dCost,则更新 bestSadCand 的值为当前候选项的索引 i,并交换 tempPred 和 bestPredif (tempPred->sa8dCost < bestPred->sa8dCost){   //通过上述循环迭代,找到了 RD 代价最小的候选项,并将其索引存储在 bestSadCand 中bestSadCand = i;std::swap(tempPred, bestPred);}}//首先,代码检查bestSadCand是否小于0,如果是,则强制选择帧内编码或帧间编码。如果是帧内编码,直接返回/* force mode decision to take inter or intra */if (bestSadCand < 0)return;//接下来,对于选择的最佳模式,计算其色度通道的运动补偿。这一步是为了对色度通道进行运动补偿,以提高编码效率/* calculate the motion compensation for chroma for the best mode selected */if ((!m_bChromaSa8d && (m_csp != X265_CSP_I400)) || (m_frame->m_fencPic->m_picCsp == X265_CSP_I400 && m_csp != X265_CSP_I400)) /* Chroma MC was done above */motionCompensation(bestPred->cu, pu, bestPred->predYuv, false, true);//如果设置了rdLevel(率失真优化级别),默认为3if (m_param->rdLevel){   // 检查是否设置了无损编码(bLossless),如果是,则将bestPred的rdCost设置为最大值if (m_param->bLossless)bestPred->rdCost = MAX_INT64;else//对bestPred进行编码,包括编码残差并计算率失真(RdSkip)的代价encodeResAndCalcRdSkipCU(*bestPred);/* Encode with residual 使用残差进行编码*///将tempPred的一些参数设置为与bestPred相同tempPred->cu.m_mvpIdx[0][0] = (uint8_t)bestSadCand;tempPred->cu.setPUInterDir(candDir[bestSadCand], 0, 0);tempPred->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);tempPred->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);tempPred->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);tempPred->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);tempPred->sa8dCost = bestPred->sa8dCost;tempPred->sa8dBits = 
bestPred->sa8dBits;tempPred->predYuv.copyFromYuv(bestPred->predYuv);//使用encodeResAndCalcRdInterCU函数对tempPred进行编码和率失真计算encodeResAndCalcRdInterCU(*tempPred, cuGeom);//根据编码结果的率失真代价(rdCost)比较,选择最佳模式md.bestMode = tempPred->rdCost < bestPred->rdCost ? tempPred : bestPred;}elsemd.bestMode = bestPred;//将最佳模式的运动矢量和参考图像索引广播给其他部分/* broadcast sets of MV field data */md.bestMode->cu.setPUInterDir(candDir[bestSadCand], 0, 0);md.bestMode->cu.setPUMv(0, candMvField[bestSadCand][0].mv, 0, 0);md.bestMode->cu.setPUMv(1, candMvField[bestSadCand][1].mv, 0, 0);md.bestMode->cu.setPURefIdx(0, (int8_t)candMvField[bestSadCand][0].refIdx, 0, 0);md.bestMode->cu.setPURefIdx(1, (int8_t)candMvField[bestSadCand][1].refIdx, 0, 0);checkDQP(*md.bestMode, cuGeom);//调用checkDQP函数对最佳模式进行DQP(差分量化参数)检查
}
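Condensing the candidate loop above: for every surviving merge candidate the encoder does luma-only motion compensation, measures the SA8D distortion, adds the bits needed to signal the candidate index, and keeps the cheapest entry; chroma motion compensation and the full skip/merge RD decision are then performed only for that winner. Below is a minimal sketch with hypothetical helper names standing in for motionCompensation, primitives.cu[].sa8d, getTUBits and calcRdSADCost.

#include <cstdint>

// Hypothetical stand-ins, for illustration only.
struct MergeCand { int mvx[2], mvy[2], refIdx[2]; uint8_t dir; };

struct MergeEval
{
    // Dummy hooks; the real code fills these from the actual reference and source pixels.
    uint32_t sa8dAfterLumaMC(const MergeCand&) const { return 1000; }                  // luma MC + SA8D
    uint32_t mergeIdxBits(uint32_t idx, uint32_t numCand) const                        // truncated-unary merge_idx
    { return idx + 1 < numCand ? idx + 1 : idx; }
    uint64_t rdSadCost(uint32_t dist, uint32_t bits) const { return dist + 4u * bits; } // toy lambda of 4
};

// Returns the index of the cheapest candidate, or -1 if none is usable.
int pickBestMergeCand(const MergeEval& e, const MergeCand* cand, uint32_t numCand)
{
    uint64_t bestCost = UINT64_MAX;
    int best = -1;
    for (uint32_t i = 0; i < numCand; i++)
    {
        uint32_t dist = e.sa8dAfterLumaMC(cand[i]);
        uint32_t bits = e.mergeIdxBits(i, numCand);
        uint64_t cost = e.rdSadCost(dist, bits);
        if (cost < bestCost) { bestCost = cost; best = (int)i; }
    }
    return best;  // chroma MC and the skip-RD vs merge-RD comparison follow for this winner only
}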

3. Building the merge candidate list: getInterMergeCandidates

This function builds the merge candidate list. The principle is the merge-mode derivation described in "Introduction to the HEVC Video Coding Standard"; the difference is that under x265's default configuration only 3 candidates are collected rather than the usual 5.
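For reference, the function probes candidates in the standard HEVC order, which is exactly what the code below does: spatial neighbours A1 (left), B1 (above), B0 (above-right), A0 (below-left), then B2 (above-left, tried only if fewer than 4 spatial candidates were found), then the temporal TMVP candidate, then combined bi-predictive candidates (B slices only), and finally zero-MV candidates until the list is full. The array name below is a hypothetical label, not an x265 symbol.

// Spatial neighbour positions relative to the current PU (standard HEVC layout):
//
//        B2 |        B1 | B0
//      -----+-----------+----
//           |           |
//           |  current  |
//        A1 |    PU     |
//      -----+-----------+
//        A0 |
//
static const char* const kMergeProbeOrder[] =
    { "A1", "B1", "B0", "A0", "B2", "TMVP", "combined bi-pred", "zero MV" };

The code analysis is as follows: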

uint32_t CUData::getInterMergeCandidates(uint32_t absPartIdx, uint32_t puIdx, MVField(*candMvField)[2], uint8_t* candDir) const
{uint32_t absPartAddr = m_absIdxInCTU + absPartIdx;const bool isInterB = m_slice->isInterB();//检查当前切片是否为 InterB 类型//获取最大合并候选项数量const uint32_t maxNumMergeCand = m_slice->m_maxNumMergeCand;//使用循环遍历合并候选项数组 candMvField,对其进行初始化。将每个候选项的运动矢量设置为零,参考索引设置为 REF_NOT_VALIDfor (uint32_t i = 0; i < maxNumMergeCand; ++i){candMvField[i][0].mv = 0;candMvField[i][1].mv = 0;candMvField[i][0].refIdx = REF_NOT_VALID;candMvField[i][1].refIdx = REF_NOT_VALID;}//计算当前 PU 的左上角像素位置和尺寸/* calculate the location of upper-left corner pixel and size of the current PU */int xP, yP, nPSW, nPSH;int cuSize = 1 << m_log2CUSize[0];int partMode = m_partSize[0];//通过查找预定义的分区表 partTable,根据当前 PU 的分区模式和 puIdx 获取尺寸信息int tmp = partTable[partMode][puIdx][0];nPSW = ((tmp >> 4) * cuSize) >> 2;//计算 nPSW 和 nPSH,分别表示当前 PU 的宽度和高度nPSH = ((tmp & 0xF) * cuSize) >> 2;//分别表示当前 PU 的左上角像素的 x 坐标和 y 坐标tmp = partTable[partMode][puIdx][1];xP = ((tmp >> 4) * cuSize) >> 2;yP = ((tmp & 0xF) * cuSize) >> 2;//初始化计数器 count 为 0uint32_t count = 0;//获取左下侧候选项uint32_t partIdxLT, partIdxRT, partIdxLB = deriveLeftBottomIdx(puIdx);PartSize curPS = (PartSize)m_partSize[absPartIdx];// left 获取左侧候选项uint32_t leftPartIdx = 0;const CUData* cuLeft = getPULeft(leftPartIdx, partIdxLB);bool isAvailableA1 = cuLeft &&//判断左侧 PU 是否可用,并满足一些特定条件cuLeft->isDiffMER(xP - 1, yP + nPSH - 1, xP, yP) &&!(puIdx == 1 && (curPS == SIZE_Nx2N || curPS == SIZE_nLx2N || curPS == SIZE_nRx2N)) &&cuLeft->isInter(leftPartIdx);if (isAvailableA1)//如果左侧 PU 可用,将其mv存储在 candDir[count] 中{// get Inter DircandDir[count] = cuLeft->m_interDir[leftPartIdx];// get Mv from LeftcuLeft->getMvField(cuLeft, leftPartIdx, 0, candMvField[count][0]);if (isInterB)cuLeft->getMvField(cuLeft, leftPartIdx, 1, candMvField[count][1]);if (++count == maxNumMergeCand)return maxNumMergeCand;}//调用 deriveLeftRightTopIdx 函数,根据当前 PU 的 puIdx 和左上角部分索引 partIdxLT、右上角部分索引 partIdxRT 计算左侧、右侧和顶部的部分索引deriveLeftRightTopIdx(puIdx, partIdxLT, partIdxRT);// above 当前 PU 的上侧合并候选项uint32_t abovePartIdx = 0;const CUData* cuAbove = getPUAbove(abovePartIdx, partIdxRT);bool isAvailableB1 = cuAbove &&cuAbove->isDiffMER(xP + nPSW - 1, yP - 1, xP, yP) &&!(puIdx == 1 && (curPS == SIZE_2NxN || curPS == SIZE_2NxnU || curPS == SIZE_2NxnD)) &&cuAbove->isInter(abovePartIdx);if (isAvailableB1 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAbove, abovePartIdx))){   //如果上侧 PU 可用,将其mv方向存储在 candDir[count] 中// get Inter DircandDir[count] = cuAbove->m_interDir[abovePartIdx];// get Mv from LeftcuAbove->getMvField(cuAbove, abovePartIdx, 0, candMvField[count][0]);if (isInterB)cuAbove->getMvField(cuAbove, abovePartIdx, 1, candMvField[count][1]);//如果候选项数量达到最大值 maxNumMergeCand,则返回 maxNumMergeCandif (++count == maxNumMergeCand)return maxNumMergeCand;}// above right 计算当前 PU 的右上侧候选项uint32_t aboveRightPartIdx = 0;const CUData* cuAboveRight = getPUAboveRight(aboveRightPartIdx, partIdxRT);bool isAvailableB0 = cuAboveRight &&cuAboveRight->isDiffMER(xP + nPSW, yP - 1, xP, yP) &&cuAboveRight->isInter(aboveRightPartIdx);if (isAvailableB0 && (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveRight, aboveRightPartIdx))){   //如果右上侧 PU 可用,将其合并方向存储在 candDir[count] 中// get Inter DircandDir[count] = cuAboveRight->m_interDir[aboveRightPartIdx];// get Mv from LeftcuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 0, candMvField[count][0]);if (isInterB)cuAboveRight->getMvField(cuAboveRight, aboveRightPartIdx, 1, candMvField[count][1]);//如果候选项数量达到最大值 maxNumMergeCand,则返回 maxNumMergeCandif (++count == maxNumMergeCand)return maxNumMergeCand;}// left bottom 计算了当前 PU 
的左下角合并候选项uint32_t leftBottomPartIdx = 0;const CUData* cuLeftBottom = this->getPUBelowLeft(leftBottomPartIdx, partIdxLB);bool isAvailableA0 = cuLeftBottom &&cuLeftBottom->isDiffMER(xP - 1, yP + nPSH, xP, yP) &&cuLeftBottom->isInter(leftBottomPartIdx);if (isAvailableA0 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuLeftBottom, leftBottomPartIdx))){   //如果左下角 PU 可用,将其合并方向存储在 candDir[count] 中// get Inter DircandDir[count] = cuLeftBottom->m_interDir[leftBottomPartIdx];// get Mv from LeftcuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 0, candMvField[count][0]);if (isInterB)cuLeftBottom->getMvField(cuLeftBottom, leftBottomPartIdx, 1, candMvField[count][1]);if (++count == maxNumMergeCand)return maxNumMergeCand;}// above left 计算当前 PU 的左上角合并候选项if (count < 4){uint32_t aboveLeftPartIdx = 0;const CUData* cuAboveLeft = getPUAboveLeft(aboveLeftPartIdx, absPartAddr);bool isAvailableB2 = cuAboveLeft &&cuAboveLeft->isDiffMER(xP - 1, yP - 1, xP, yP) &&cuAboveLeft->isInter(aboveLeftPartIdx);if (isAvailableB2 && (!isAvailableA1 || !cuLeft->hasEqualMotion(leftPartIdx, *cuAboveLeft, aboveLeftPartIdx))&& (!isAvailableB1 || !cuAbove->hasEqualMotion(abovePartIdx, *cuAboveLeft, aboveLeftPartIdx))){   //如果左上角 PU 可用,将其合并方向存储在 candDir[count] 中// get Inter DircandDir[count] = cuAboveLeft->m_interDir[aboveLeftPartIdx];// get Mv from LeftcuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 0, candMvField[count][0]);if (isInterB)cuAboveLeft->getMvField(cuAboveLeft, aboveLeftPartIdx, 1, candMvField[count][1]);if (++count == maxNumMergeCand)return maxNumMergeCand;}}//首先检查是否启用了时域运动矢量预测(Temporal MVP)功能if (m_slice->m_sps->bTemporalMVPEnabled){   //调用 deriveRightBottomIdx 函数,根据当前 PU 的索引 puIdx 计算右下角的部分索引 partIdxRBuint32_t partIdxRB = deriveRightBottomIdx(puIdx);MV colmv;int ctuIdx = -1;// image boundary check 对图像边界进行检查,确保右下角的坐标没有超出图像边界if (m_encData->getPicCTU(m_cuAddr)->m_cuPelX + g_zscanToPelX[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picWidthInLumaSamples &&m_encData->getPicCTU(m_cuAddr)->m_cuPelY + g_zscanToPelY[partIdxRB] + UNIT_SIZE < m_slice->m_sps->picHeightInLumaSamples){   //根据图像边界检查的结果,计算右下角相邻 PU 的位置uint32_t absPartIdxRB = g_zscanToRaster[partIdxRB];uint32_t numUnits = s_numPartInCUSize;bool bNotLastCol = lessThanCol(absPartIdxRB, numUnits - 1); // is not at the last column of CTUbool bNotLastRow = lessThanRow(absPartIdxRB, numUnits - 1); // is not at the last row    of CTU//如果右下角不是 CTU 的最后一列和最后一行的单元格,则将右下角相邻 PU 的绝对地址 absPartAddr 设置为当前 PU 右下角相邻 PU 的地址,并设置 ctuIdx 为当前 CTU 的地址if (bNotLastCol && bNotLastRow){absPartAddr = g_rasterToZscan[absPartIdxRB + RASTER_SIZE + 1];ctuIdx = m_cuAddr;}else if (bNotLastCol)absPartAddr = g_rasterToZscan[(absPartIdxRB + 1) & (numUnits - 1)];else if (bNotLastRow){   //如果右下角是 CTU 的最后一列但不是最后一行的单元格,则将 absPartAddr 设置为当前 PU 右侧相邻 PU 的地址,并设置 ctuIdx 为当前 CTU 的地址加一absPartAddr = g_rasterToZscan[absPartIdxRB + RASTER_SIZE - numUnits + 1];ctuIdx = m_cuAddr + 1;}else // is the right bottom corner of CTU 如果右下角是 CTU 的最后一个单元格,则将 absPartAddr 设置为零absPartAddr = 0;}//根据是否为 InterB 类型的条件,确定最大参考图像列表数 maxListint maxList = isInterB ? 
2 : 1;int dir = 0, refIdx = 0;for (int list = 0; list < maxList; list++){   //判断是否存在相邻块的运动矢量bool bExistMV = ctuIdx >= 0 && getColMVP(colmv, refIdx, list, ctuIdx, absPartAddr);if (!bExistMV){uint32_t partIdxCenter = deriveCenterIdx(puIdx);bExistMV = getColMVP(colmv, refIdx, list, m_cuAddr, partIdxCenter);}if (bExistMV){   //如果相邻块的运动矢量存在,将其存储在 candMvField[count][list] 中,并设置 dir 的相应位为 1,表示该列表存在有效运动矢量dir |= (1 << list);candMvField[count][list].mv = colmv;candMvField[count][list].refIdx = refIdx;}}if (dir != 0){candDir[count] = (uint8_t)dir;if (++count == maxNumMergeCand)return maxNumMergeCand;}}if (isInterB){   //计算 cutoff,用于确定循环次数,初始化两个优先级列表 priorityList0 和 priorityList1,这些列表用于确定候选项的优先级顺序const uint32_t cutoff = count * (count - 1);uint32_t priorityList0 = 0xEDC984; // { 0, 1, 0, 2, 1, 2, 0, 3, 1, 3, 2, 3 }uint32_t priorityList1 = 0xB73621; // { 1, 0, 2, 0, 2, 1, 3, 0, 3, 1, 3, 2 }for (uint32_t idx = 0; idx < cutoff; idx++, priorityList0 >>= 2, priorityList1 >>= 2){int i = priorityList0 & 3;int j = priorityList1 & 3;if ((candDir[i] & 0x1) && (candDir[j] & 0x2)){   //如果满足条件,则从 cand[i] 和 cand[j] 获取运动矢量和参考索引// get Mv from cand[i] and cand[j]int refIdxL0 = candMvField[i][0].refIdx;int refIdxL1 = candMvField[j][1].refIdx;int refPOCL0 = m_slice->m_refPOCList[0][refIdxL0];int refPOCL1 = m_slice->m_refPOCList[1][refIdxL1];if (!(refPOCL0 == refPOCL1 && candMvField[i][0].mv == candMvField[j][1].mv)){candMvField[count][0].mv = candMvField[i][0].mv;candMvField[count][0].refIdx = refIdxL0;candMvField[count][1].mv = candMvField[j][1].mv;candMvField[count][1].refIdx = refIdxL1;candDir[count] = 3;if (++count == maxNumMergeCand)return maxNumMergeCand;}}}}int numRefIdx = (isInterB) ? X265_MIN(m_slice->m_numRefIdx[0], m_slice->m_numRefIdx[1]) : m_slice->m_numRefIdx[0];int r = 0;int refcnt = 0;while (count < maxNumMergeCand){   //将方向设为 1,表示为单向合并,将当前候选项的运动矢量设置为零candDir[count] = 1;candMvField[count][0].mv.word = 0;candMvField[count][0].refIdx = r;if (isInterB){candDir[count] = 3;candMvField[count][1].mv.word = 0;candMvField[count][1].refIdx = r;}count++;if (refcnt == numRefIdx - 1)r = 0;else{++r;++refcnt;}}return count;
}

4. Motion compensation: Predict::motionCompensation

Motion compensation builds the prediction block from the reference block addressed by the motion vector (MV).
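Two per-sample operations are worth keeping in mind before reading the code: with explicit weighted prediction, each uni-directional sample is scaled and offset using the slice's weight table, and plain bi-prediction without weights averages the two reference blocks with rounding. The sketch below shows the pixel-domain idea only; the real x265 primitives (addWeightUni, addWeightBi, Yuv::addAvg) operate on the higher-precision intermediate samples produced by the interpolation filters.

#include <algorithm>

// Clip to the valid pixel range for the given bit depth.
static inline int clipPixel(int v, int bitDepth) { return std::min(std::max(v, 0), (1 << bitDepth) - 1); }

// Uni-directional weighted prediction of one sample (conceptually what addWeightUni does):
// p is the interpolated reference sample; w, offset and shift come from the slice weight table,
// and round is 1 << (shift - 1) when shift >= 1, else 0.
static inline int weightUniSample(int p, int w, int offset, int shift, int round, int bitDepth)
{
    return clipPixel(((w * p + round) >> shift) + offset, bitDepth);
}

// Plain bi-prediction without weights (conceptually what Yuv::addAvg does):
// average the two motion-compensated samples with rounding.
static inline int biAvgSample(int p0, int p1, int bitDepth)
{
    return clipPixel((p0 + p1 + 1) >> 1, bitDepth);
}

With that in mind, the corresponding code analysis is as follows: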

void Predict::motionCompensation(const CUData& cu, const PredictionUnit& pu, Yuv& predYuv, bool bLuma, bool bChroma)
{int refIdx0 = cu.m_refIdx[0][pu.puAbsPartIdx];int refIdx1 = cu.m_refIdx[1][pu.puAbsPartIdx];if (cu.m_slice->isInterP()){/* P Slice */WeightValues wv0[3];//检查参考索引 refIdx0 是否有效X265_CHECK(refIdx0 >= 0, "invalid P refidx\n");X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "P refidx out of range\n");const WeightParam *wp0 = cu.m_slice->m_weightPredTable[0][refIdx0];MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];cu.clipMv(mv0);//如果启用了加权预测并且权重参数存在,则根据权重参数计算加权值if (cu.m_slice->m_pps->bUseWeightPred && wp0->wtPresent){for (int plane = 0; plane < (bChroma ? 3 : 1); plane++){   //对于亮度和色度平面,计算权重值 wv0,包括权重值 w、偏移值 offset、右移位数 shift 和舍入值 roundwv0[plane].w      = wp0[plane].inputWeight;wv0[plane].offset = wp0[plane].inputOffset * (1 << (X265_DEPTH - 8));wv0[plane].shift  = wp0[plane].log2WeightDenom;wv0[plane].round  = wp0[plane].log2WeightDenom >= 1 ? 1 << (wp0[plane].log2WeightDenom - 1) : 0;}ShortYuv& shortYuv = m_predShortYuv[0];if (bLuma)//如果需要亮度平面补偿,则调用 predInterLumaShort 函数对亮度平面进行加权预测predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);if (bChroma)//如果需要色度平面补偿,则调用 predInterChromaShort 函数对色度平面进行加权预测predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);//使用加权预测的结果,通过调用 addWeightUni 函数将加权预测结果与预测单元的 predYuv 相加addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);}else{   //如果未启用加权预测或者权重参数不存在,则执行像素级的运动补偿if (bLuma)//如果需要亮度平面补偿,则调用 predInterLumaPixel 函数对亮度平面进行像素级的运动补偿predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);if (bChroma)predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);}}else{/* B Slice */WeightValues wv0[3], wv1[3];const WeightParam *pwp0, *pwp1;X265_CHECK(refIdx0 < cu.m_slice->m_numRefIdx[0], "bidir refidx0 out of range\n");X265_CHECK(refIdx1 < cu.m_slice->m_numRefIdx[1], "bidir refidx1 out of range\n");if (cu.m_slice->m_pps->bUseWeightedBiPred){pwp0 = refIdx0 >= 0 ? cu.m_slice->m_weightPredTable[0][refIdx0] : NULL;pwp1 = refIdx1 >= 0 ? cu.m_slice->m_weightPredTable[1][refIdx1] : NULL;if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent)){/* biprediction weighting */for (int plane = 0; plane < (bChroma ? 3 : 1); plane++){wv0[plane].w = pwp0[plane].inputWeight;wv0[plane].o = pwp0[plane].inputOffset * (1 << (X265_DEPTH - 8));wv0[plane].shift = pwp0[plane].log2WeightDenom;wv0[plane].round = 1 << pwp0[plane].log2WeightDenom;wv1[plane].w = pwp1[plane].inputWeight;wv1[plane].o = pwp1[plane].inputOffset * (1 << (X265_DEPTH - 8));wv1[plane].shift = wv0[plane].shift;wv1[plane].round = wv0[plane].round;}}else{/* uniprediction weighting, always outputs to wv0 */const WeightParam* pwp = (refIdx0 >= 0) ? pwp0 : pwp1;for (int plane = 0; plane < (bChroma ? 3 : 1); plane++){wv0[plane].w = pwp[plane].inputWeight;wv0[plane].offset = pwp[plane].inputOffset * (1 << (X265_DEPTH - 8));wv0[plane].shift = pwp[plane].log2WeightDenom;wv0[plane].round = pwp[plane].log2WeightDenom >= 1 ? 
1 << (pwp[plane].log2WeightDenom - 1) : 0;}}}elsepwp0 = pwp1 = NULL;if (refIdx0 >= 0 && refIdx1 >= 0){MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];cu.clipMv(mv0);cu.clipMv(mv1);if (bLuma){predInterLumaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);predInterLumaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);}if (bChroma){predInterChromaShort(pu, m_predShortYuv[0], *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);predInterChromaShort(pu, m_predShortYuv[1], *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);}if (pwp0 && pwp1 && (pwp0->wtPresent || pwp1->wtPresent))addWeightBi(pu, predYuv, m_predShortYuv[0], m_predShortYuv[1], wv0, wv1, bLuma, bChroma);elsepredYuv.addAvg(m_predShortYuv[0], m_predShortYuv[1], pu.puAbsPartIdx, pu.width, pu.height, bLuma, bChroma);}else if (refIdx0 >= 0){MV mv0 = cu.m_mv[0][pu.puAbsPartIdx];cu.clipMv(mv0);if (pwp0 && pwp0->wtPresent){ShortYuv& shortYuv = m_predShortYuv[0];if (bLuma)predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);if (bChroma)predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);}else{if (bLuma)predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);if (bChroma)predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[0][refIdx0], mv0);}}else{MV mv1 = cu.m_mv[1][pu.puAbsPartIdx];cu.clipMv(mv1);/* uniprediction to L1 */X265_CHECK(refIdx1 >= 0, "refidx1 was not positive\n");if (pwp1 && pwp1->wtPresent){ShortYuv& shortYuv = m_predShortYuv[0];if (bLuma)predInterLumaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);if (bChroma)predInterChromaShort(pu, shortYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);addWeightUni(pu, predYuv, shortYuv, wv0, bLuma, bChroma);}else{if (bLuma)predInterLumaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);if (bChroma)predInterChromaPixel(pu, predYuv, *cu.m_slice->m_refReconPicList[1][refIdx1], mv1);}}}
}

5. Skip-mode rate-distortion cost: Search::encodeResAndCalcRdSkipCU

This function computes the rate-distortion cost of skip mode. No residual is coded in skip mode, so the motion-compensated prediction itself becomes the reconstruction.
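Because skip codes no residual, its rate is only the skip flag plus the merge index, and its distortion is the SSE between the source block and the prediction. The cost it competes with is the usual Lagrangian J = D + lambda * R; the sketch below illustrates this with a plain double lambda for clarity, whereas x265 keeps lambda in fixed point inside RDCost.

#include <cstdint>

// Illustrative J = D + lambda * R for a skip candidate: distortion is the SSE between
// source and prediction (no residual is coded), rate is only skip flag + merge index bits.
static double skipRdCostSketch(uint64_t sseLumaPlusChroma, uint32_t skipFlagBits,
                               uint32_t mergeIdxBits, double lambda)
{
    uint32_t totalBits = skipFlagBits + mergeIdxBits;   // coeffBits == 0 for skip
    return (double)sseLumaPlusChroma + lambda * totalBits;
}

The corresponding code analysis is as follows: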

/* Note: this function overwrites the RD cost variables of interMode, but leaves the sa8d cost unharmed */
void Search::encodeResAndCalcRdSkipCU(Mode& interMode)
{   //函数从传入的interMode中获取相关参数,如CU数据、重建图像(reconYuv)、原始图像(fencYuv)和预测图像(predYuv)等CUData& cu = interMode.cu;Yuv* reconYuv = &interMode.reconYuv;const Yuv* fencYuv = interMode.fencYuv;Yuv* predYuv = &interMode.predYuv;X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");uint32_t depth  = cu.m_cuDepth[0];// No residual coding : SKIP mode//设置CU的预测模式为SKIP模式,并清除CBF(系数非零标志)cu.setPredModeSubParts(MODE_SKIP);cu.clearCbf();cu.setTUDepthSubParts(0, 0, depth);//将重建图像设置为预测图像的副本reconYuv->copyFromYuv(interMode.predYuv);// Luma 对亮度分量(Luma)进行失真计算,计算方法是使用SSE(平方误差和)int part = partitionFromLog2Size(cu.m_log2CUSize[0]);interMode.lumaDistortion = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);interMode.distortion = interMode.lumaDistortion;// Chroma 如果色度格式不是X265_CSP_I400(即非单色度格式),则对色度分量(Chroma)也进行失真计算,并将其添加到总失真中if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400){interMode.chromaDistortion = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));interMode.chromaDistortion += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[part].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));interMode.distortion += interMode.chromaDistortion;}//将总失真存储在CU的m_distortion数组中cu.m_distortion[0] = interMode.distortion;m_entropyCoder.load(m_rqt[depth].cur);//加载熵编码器,并重置比特数m_entropyCoder.resetBits();if (m_slice->m_pps->bTransquantBypassEnabled)//如果启用了变换绕过标志(Transquant Bypass),则编码CU的绕过标志m_entropyCoder.codeCUTransquantBypassFlag(cu.m_tqBypass[0]);m_entropyCoder.codeSkipFlag(cu, 0);//编码CU的跳过标志(skipFlag)所占的比特数int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();m_entropyCoder.codeMergeIndex(cu, 0);//编码CU的合并索引(mergeIndex),并计算跳过标志所占的比特数interMode.mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;interMode.coeffBits = 0;interMode.totalBits = interMode.mvBits + skipFlagBits;//计算运动矢量(mv)的比特数,并将其与跳过标志的比特数相加,得到总比特数if (m_rdCost.m_psyRd)//如果启用了心理视觉优化(psyRd),则计算心理能量interMode.psyEnergy = m_rdCost.psyCost(part, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);else if(m_rdCost.m_ssimRd)//如果启用了SSIM优化(ssimRd),则计算SSIM能量interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);//计算残差能量,即原始图像与预测图像之间的SSEinterMode.resEnergy = primitives.cu[part].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);updateModeCost(interMode);//更新模式的代价(cost)m_entropyCoder.store(interMode.contexts);//存储模式的上下文(contexts)
}

6. Merge-mode rate-distortion cost: Search::encodeResAndCalcRdInterCU

This function encodes the residual and computes the rate-distortion cost for merge mode (it actually performs the RD calculation for any inter mode, not only merge; only skip mode is handled separately).
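One step inside this function deserves emphasis: after estimateResidualQT has priced the best residual coding, the encoder also prices the option of signalling no residual at all (rqt_root_cbf = 0, in which case the distortion equals the SSE between source and prediction) and keeps whichever is cheaper. This is how a merge 2Nx2N CU with an all-zero root CBF ends up demoted to skip. The sketch below condenses that comparison with hypothetical names; the fixed-point rounding is an assumption, not taken from the x265 source.

#include <cstdint>

// Hypothetical summary of the root-CBF-zero decision in encodeResAndCalcRdInterCU().
struct ResidualChoice { bool codeResidual; uint64_t rdCost; };

static ResidualChoice chooseResidual(uint64_t costWithResidual,
                                     uint64_t sseNoResidual,   // SSE(source, prediction)
                                     uint32_t cbf0Bits,        // bits to code rqt_root_cbf = 0
                                     uint64_t lambdaFp)        // lambda * 256 (fixed point)
{
    uint64_t costNoResidual = sseNoResidual + ((cbf0Bits * lambdaFp + 128) >> 8);
    if (costNoResidual < costWithResidual)
        return { false, costNoResidual };   // clear CBFs; a merge 2Nx2N CU then becomes skip
    return { true, costWithResidual };
}

The corresponding code analysis is as follows: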

void Search::encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom)
{ProfileCUScope(interMode.cu, interRDOElapsedTime[cuGeom.depth], countInterRDO[cuGeom.depth]);//函数从传入的interMode和cuGeom中获取相关参数,如CU数据、重建图像(reconYuv)、预测图像(predYuv)、CU的深度(depth)和CU的大小的对数(log2CUSizeCUData& cu = interMode.cu;Yuv* reconYuv = &interMode.reconYuv;Yuv* predYuv = &interMode.predYuv;uint32_t depth = cuGeom.depth;ShortYuv* resiYuv = &m_rqt[depth].tmpResiYuv;const Yuv* fencYuv = interMode.fencYuv;X265_CHECK(!cu.isIntra(0), "intra CU not expected\n");uint32_t log2CUSize = cuGeom.log2CUSize;int sizeIdx = log2CUSize - 2;//使用预测图像和原始图像计算残差图像(resiYuv)resiYuv->subtract(*fencYuv, *predYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);uint32_t tuDepthRange[2];cu.getInterTUQtDepthRange(tuDepthRange, 0);//获取CU的亮度变换深度范围(tuDepthRange)//加载熵编码器m_entropyCoder.load(m_rqt[depth].cur);//根据限制条件(limitTU)和邻域限制(X265_TU_LIMIT_NEIGH),选择不同的TU(变换单元)深度估计方法if ((m_limitTU & X265_TU_LIMIT_DFS) && !(m_limitTU & X265_TU_LIMIT_NEIGH))m_maxTUDepth = -1;else if (m_limitTU & X265_TU_LIMIT_BFS)memset(&m_cacheTU, 0, sizeof(TUInfoCache));Cost costs;if (m_limitTU & X265_TU_LIMIT_NEIGH){   //如果存在邻域限制,则在计算之前保存并重新加载最大TU深度(m_maxTUDepth)/* Save and reload maxTUDepth to avoid changing of maxTUDepth between modes */int32_t tempDepth = m_maxTUDepth;if (m_maxTUDepth != -1){uint32_t splitFlag = interMode.cu.m_partSize[0] != SIZE_2Nx2N;uint32_t minSize = tuDepthRange[0];uint32_t maxSize = tuDepthRange[1];maxSize = X265_MIN(maxSize, cuGeom.log2CUSize - splitFlag);m_maxTUDepth = x265_clip3(cuGeom.log2CUSize - maxSize, cuGeom.log2CUSize - minSize, (uint32_t)m_maxTUDepth);}//调用estimateResidualQT函数进行残差估计estimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);m_maxTUDepth = tempDepth;}elseestimateResidualQT(interMode, cuGeom, 0, 0, *resiYuv, costs, tuDepthRange);uint32_t tqBypass = cu.m_tqBypass[0];if (!tqBypass){   //使用原始图像和预测图像计算亮度分量(Luma)的SSE(平方误差和)sse_t cbf0Dist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400){   //如果颜色空间不是X265_CSP_I400(即非单色度格式),则还计算色度分量(Chroma)的SSE,并将其添加到总失真(cbf0Dist)中cbf0Dist += m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], predYuv->m_csize, predYuv->m_buf[1], predYuv->m_csize));cbf0Dist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], predYuv->m_csize, predYuv->m_buf[2], predYuv->m_csize));}//加载熵编码器,并重置比特数/* Consider the RD cost of not signaling any residual */m_entropyCoder.load(m_rqt[depth].cur);m_entropyCoder.resetBits();m_entropyCoder.codeQtRootCbfZero();uint32_t cbf0Bits = m_entropyCoder.getNumberOfWrittenBits();uint32_t cbf0Energy; uint64_t cbf0Cost;if (m_rdCost.m_psyRd){   //如果启用了心理视觉优化(psyRd),则计算心理能量(cbf0Energy)cbf0Energy = m_rdCost.psyCost(log2CUSize - 2, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);cbf0Cost = m_rdCost.calcPsyRdCost(cbf0Dist, cbf0Bits, cbf0Energy);}else if(m_rdCost.m_ssimRd){   //如果启用了SSIM优化(ssimRd),则计算SSIM能量(cbf0Energy)cbf0Energy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size, log2CUSize, TEXT_LUMA, 0);cbf0Cost = m_rdCost.calcSsimRdCost(cbf0Dist, cbf0Bits, cbf0Energy);}else//计算不传输任何残差的代价(cbf0Cost)cbf0Cost = m_rdCost.calcRdCost(cbf0Dist, cbf0Bits);//如果不传输残差的代价(cbf0Cost)比当前的代价(costs.rdcost)更小,则清除CBF并将变换深度设置为0if (cbf0Cost < costs.rdcost){cu.clearCbf();cu.setTUDepthSubParts(0, 0, depth);}}//如果CBF非零,则保存残差数据if (cu.getQtRootCbf(0))saveResidualQTData(cu, *resiYuv, 0, 
0);//首先,加载当前变换深度(depth)对应的熵编码器/* calculate signal bits for inter/merge/skip coded CU */m_entropyCoder.load(m_rqt[depth].cur);//重置比特数m_entropyCoder.resetBits();if (m_slice->m_pps->bTransquantBypassEnabled)m_entropyCoder.codeCUTransquantBypassFlag(tqBypass);//如果CU启用了合并标志(mergeFlag)且分块大小为SIZE_2Nx2N且根节点的CBF为0,则将预测模式设置为MODE_SKIP,并对合并标志、运动矢量进行编码uint32_t coeffBits, bits, mvBits;if (cu.m_mergeFlag[0] && cu.m_partSize[0] == SIZE_2Nx2N && !cu.getQtRootCbf(0)){cu.setPredModeSubParts(MODE_SKIP);/* Merge/Skip */coeffBits = mvBits = 0;m_entropyCoder.codeSkipFlag(cu, 0);int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();m_entropyCoder.codeMergeIndex(cu, 0);mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;bits = mvBits + skipFlagBits;}else{   //否则,对跳过标志(skipFlag)、预测模式、分块大小、预测信息、系数进行编码m_entropyCoder.codeSkipFlag(cu, 0);int skipFlagBits = m_entropyCoder.getNumberOfWrittenBits();m_entropyCoder.codePredMode(cu.m_predMode[0]);m_entropyCoder.codePartSize(cu, 0, cuGeom.depth);m_entropyCoder.codePredInfo(cu, 0);mvBits = m_entropyCoder.getNumberOfWrittenBits() - skipFlagBits;bool bCodeDQP = m_slice->m_pps->bUseDQP;m_entropyCoder.codeCoeff(cu, 0, bCodeDQP, tuDepthRange);bits = m_entropyCoder.getNumberOfWrittenBits();//计算编码系数所占的比特数(coeffBits),编码运动矢量所占的比特数(mvBits),以及总的比特数(bits)coeffBits = bits - mvBits - skipFlagBits;}//使用熵编码器将编码模式的上下文(contexts)存储起来m_entropyCoder.store(interMode.contexts);//如果根节点的CBF非零,则将重建的图像(reconYuv)根据预测残差(resiYuv)进行修剪和累加,否则直接从预测图像(predYuv)复制。if (cu.getQtRootCbf(0))reconYuv->addClip(*predYuv, *resiYuv, log2CUSize, m_frame->m_fencPic->m_picCsp);elsereconYuv->copyFromYuv(*predYuv);//计算修剪后的失真和代价。首先计算亮度分量(Luma)的SSE(bestLumaDist),并更新编码模式的失真(distortion)为该值。如果颜色空间不是X265_CSP_I400(即非单色度格式),则还计算色度分量(Chroma)的SSE,并将其添加到编码模式的色度失真(chromaDistortion)和总失真(distortion)中// update with clipped distortion and cost (qp estimation loop uses unclipped values)sse_t bestLumaDist = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);interMode.distortion = bestLumaDist;if (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400){sse_t bestChromaDist = m_rdCost.scaleChromaDist(1, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[1], fencYuv->m_csize, reconYuv->m_buf[1], reconYuv->m_csize));bestChromaDist += m_rdCost.scaleChromaDist(2, primitives.chroma[m_csp].cu[sizeIdx].sse_pp(fencYuv->m_buf[2], fencYuv->m_csize, reconYuv->m_buf[2], reconYuv->m_csize));interMode.chromaDistortion = bestChromaDist;interMode.distortion += bestChromaDist;}if (m_rdCost.m_psyRd)//如果启用了心理视觉优化(psyRd),则计算心理能量(psyEnergy)interMode.psyEnergy = m_rdCost.psyCost(sizeIdx, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size);else if(m_rdCost.m_ssimRd)//如果启用了SSIM优化(ssimRd),则计算SSIM能量(ssimEnergy)interMode.ssimEnergy = m_quant.ssimDistortion(cu, fencYuv->m_buf[0], fencYuv->m_size, reconYuv->m_buf[0], reconYuv->m_size, cu.m_log2CUSize[0], TEXT_LUMA, 0);//计算预测残差的能量(resEnergy) 更新编码模式的总比特数(totalBits)、亮度失真(lumaDistortion)、系数比特数(coeffBits)、运动矢量比特数(mvBits)和CU的失真(cu.m_distortion[0])interMode.resEnergy = primitives.cu[sizeIdx].sse_pp(fencYuv->m_buf[0], fencYuv->m_size, predYuv->m_buf[0], predYuv->m_size);interMode.totalBits = bits;interMode.lumaDistortion = bestLumaDist;interMode.coeffBits = coeffBits;interMode.mvBits = mvBits;cu.m_distortion[0] = interMode.distortion;updateModeCost(interMode);//更新编码模式的代价(modeCost)checkDQP(interMode, cuGeom);//检查是否需要更新QP值(checkDQP)
}
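The choice above between signalling no residual (CBF = 0) and signalling the estimated residual is a plain rate-distortion comparison: if the all-zero-residual cost is lower, the CBF is cleared and the CU is coded without residual. Below is a minimal conceptual sketch of that decision, assuming the classic D + lambda*R form with a floating-point lambda (x265's calcRdCost/calcPsyRdCost are fixed-point implementations; the function names here are illustrative only):

#include <cstdint>

struct RdChoice { bool signalResidual; double cost; };

// Classic Lagrangian rate-distortion cost; x265 uses a fixed-point equivalent
static double lagrangianCost(double distortion, uint32_t bits, double lambda)
{
    return distortion + lambda * bits;
}

// cbf0Dist/cbf0Bits: distortion and bits when no residual is signalled (cf. cbf0Dist/cbf0Bits above)
// resiDist/resiBits: distortion and bits when the residual estimated by estimateResidualQT is signalled
RdChoice chooseResidualSignalling(double cbf0Dist, uint32_t cbf0Bits,
                                  double resiDist, uint32_t resiBits, double lambda)
{
    double cbf0Cost = lagrangianCost(cbf0Dist, cbf0Bits, lambda);
    double resiCost = lagrangianCost(resiDist, resiBits, lambda);
    // If skipping the residual is cheaper, the real code clears the CBF and sets the TU depth to 0
    return cbf0Cost < resiCost ? RdChoice{false, cbf0Cost} : RdChoice{true, resiCost};
}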

7. Complexity check Analysis::complexityCheckCU

This function checks the complexity of a CU against different complexity metrics to decide whether further splitting (recursion) can be skipped.

If recursionSkipMode equals RDCOST_BASED_RSKIP, the first branch is taken. It walks the luma samples in bestMode.fencYuv->m_buf[0], computes their mean, and then the mean absolute deviation from that mean (the variable homo). If homo is less than 10% of the mean, the CU is considered flat and of low complexity, so the function returns true and further splitting can be skipped; otherwise it returns false and the CU should be split further.

Otherwise the second branch is taken. It computes the edge variance of the CU's region in the edge picture m_frame->m_edgeBitPic: the primitives.cu[blockType].var primitive returns the pixel sum and the sum of squares packed into a single 64-bit value, from which cuEdgeVariance is derived. If cuEdgeVariance is greater than the edgeVarThreshold parameter, the CU contains significant edge detail and is considered complex, so the function returns false and further splitting is needed; otherwise it returns true and splitting can be skipped.

bool Analysis::complexityCheckCU(const Mode& bestMode)
{if (m_param->recursionSkipMode == RDCOST_BASED_RSKIP){uint32_t mean = 0;uint32_t homo = 0;uint32_t cuSize = bestMode.fencYuv->m_size;for (uint32_t y = 0; y < cuSize; y++) {for (uint32_t x = 0; x < cuSize; x++) {mean += (bestMode.fencYuv->m_buf[0][y * cuSize + x]);}}mean = mean / (cuSize * cuSize);for (uint32_t y = 0; y < cuSize; y++) {for (uint32_t x = 0; x < cuSize; x++) {homo += abs(int(bestMode.fencYuv->m_buf[0][y * cuSize + x] - mean));}}homo = homo / (cuSize * cuSize);if (homo < (.1 * mean))return true;return false;}else{int blockType = bestMode.cu.m_log2CUSize[0] - LOG2_UNIT_SIZE;int shift = bestMode.cu.m_log2CUSize[0] * LOG2_UNIT_SIZE;intptr_t stride = m_frame->m_fencPic->m_stride;intptr_t blockOffsetLuma = bestMode.cu.m_cuPelX + bestMode.cu.m_cuPelY * stride;uint64_t sum_ss = primitives.cu[blockType].var(m_frame->m_edgeBitPic + blockOffsetLuma, stride);uint32_t sum = (uint32_t)sum_ss;uint32_t ss = (uint32_t)(sum_ss >> 32);uint32_t pixelCount = 1 << shift;double cuEdgeVariance = (ss - ((double)sum * sum / pixelCount)) / pixelCount;if (cuEdgeVariance > (double)m_param->edgeVarThreshold)return false;elsereturn true;}}
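In the second branch above, the var primitive returns the pixel sum in the low 32 bits and the sum of squares in the high 32 bits of one 64-bit value, and the edge variance is then obtained from Var = E[X^2] - (E[X])^2 = (ss - sum*sum/N) / N. Here is a minimal sketch of just that unpacking step (a standalone helper written for illustration):

#include <cstdint>

// sum_ss: packed result of the var primitive; pixelCount: number of pixels in the CU region (1 << shift in the code above)
double varianceFromPacked(uint64_t sum_ss, uint32_t pixelCount)
{
    uint32_t sum = (uint32_t)sum_ss;          // low 32 bits: sum of pixels
    uint32_t ss  = (uint32_t)(sum_ss >> 32);  // high 32 bits: sum of squared pixels
    // Var = E[X^2] - (E[X])^2 = (ss - sum*sum/N) / N
    return (ss - (double)sum * sum / pixelCount) / pixelCount;
}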

8. Inter prediction Analysis::checkInter_rd0_4

The corresponding code analysis is as follows:

void Analysis::checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refMask[2])
{   //初始化帧间预测模式(interMode)的成本interMode.initCosts();interMode.cu.setPartSizeSubParts(partSize);interMode.cu.setPredModeSubParts(MODE_INTER);int numPredDir = m_slice->isInterP() ? 1 : 2;//根据切片类型确定预测方向数目if (m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10 && m_reuseInterDataCTU){int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;int index = 0;//则从已重用的数据中获取参考图像(m_reuseRef)并分配给每个子块的最佳运动估计结果(bestME)uint32_t numPU = interMode.cu.getNumPartInter(0);for (uint32_t part = 0; part < numPU; part++){MotionData* bestME = interMode.bestME[part];for (int32_t i = 0; i < numPredDir; i++)bestME[i].ref = m_reuseRef[refOffset + index++];}}if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && m_reuseInterDataCTU){uint32_t numPU = interMode.cu.getNumPartInter(0);for (uint32_t part = 0; part < numPU; part++){MotionData* bestME = interMode.bestME[part];for (int32_t i = 0; i < numPredDir; i++){   //从已重用的数据中获取参考图像索引(ref)、运动矢量(mv)和运动矢量预测索引(mvpIdx)并分配给每个子块的最佳运动估计结果(bestME)int* ref = &m_reuseRef[i * m_frame->m_analysisData.numPartitions * m_frame->m_analysisData.numCUsInFrame];bestME[i].ref = ref[cuGeom.absPartIdx];bestME[i].mv = m_reuseMv[i][cuGeom.absPartIdx].word;bestME[i].mvpIdx = m_reuseMvpIdx[i][cuGeom.absPartIdx];}}}//调用predInterSearch函数进行帧间预测搜索,得到运动矢量和相关数据,并设置参考图像掩码(refMask)predInterSearch(interMode, cuGeom, m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400), refMask);/* predInterSearch sets interMode.sa8dBits 计算预测和原始图像之间的失真*/const Yuv& fencYuv = *interMode.fencYuv;Yuv& predYuv = interMode.predYuv;int part = partitionFromLog2Size(cuGeom.log2CUSize);//对亮度分量进行SA8D(Sum of Absolute 8x8 Differences)计算,并累加到失真(distortion)中interMode.distortion = primitives.cu[part].sa8d(fencYuv.m_buf[0], fencYuv.m_size, predYuv.m_buf[0], predYuv.m_size);if (m_bChromaSa8d && (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)){   //如果启用了色度SA8D且色度格式不是I400,则对色度分量进行SA8D计算,并累加到失真中interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[1], fencYuv.m_csize, predYuv.m_buf[1], predYuv.m_csize);interMode.distortion += primitives.chroma[m_csp].cu[part].sa8d(fencYuv.m_buf[2], fencYuv.m_csize, predYuv.m_buf[2], predYuv.m_csize);}//使用失真和比特数计算SA8D成本(sa8dCost)interMode.sa8dCost = m_rdCost.calcRdSADCost((uint32_t)interMode.distortion, interMode.sa8dBits);//如果满足条件(m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU),将每个子块的最佳参考索引(bestME[i].ref)保存到已重用的数据中if (m_param->analysisSaveReuseLevel > 1 && m_reuseInterDataCTU){int refOffset = cuGeom.geomRecurId * 16 * numPredDir + partSize * numPredDir * 2;int index = 0;uint32_t numPU = interMode.cu.getNumPartInter(0);for (uint32_t puIdx = 0; puIdx < numPU; puIdx++){MotionData* bestME = interMode.bestME[puIdx];for (int32_t i = 0; i < numPredDir; i++)m_reuseRef[refOffset + index++] = bestME[i].ref;}}
}
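The sa8dCost computed here can be read conceptually as "SA8D distortion plus lambda times the prediction signalling bits", where lambda is the Lagrange multiplier matched to the SAD/SATD domain. calcRdSADCost is a rounded fixed-point implementation in x265; the sketch below only expresses the same idea with a floating-point lambda and a hypothetical function name:

#include <cstdint>

// Illustrative only: the SA8D-stage cost used for fast mode screening, not the final RDO cost
uint64_t sa8dStageCost(uint64_t sa8dDistortion, uint32_t sa8dBits, double lambda)
{
    // x265 uses a fixed-point lambda with rounding; this is the floating-point equivalent
    return sa8dDistortion + (uint64_t)(lambda * sa8dBits + 0.5);
}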

9. Inter prediction search Search::predInterSearch

The inter prediction search compares the cost of merge mode, uni-directional prediction and bi-directional prediction, and picks the best mode for each PU. The corresponding code analysis is as follows:

/* find the best inter prediction for each PU of specified mode */
void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t refMasks[2])
{ProfileCUScope(interMode.cu, motionEstimationElapsedTime, countMotionEstimate);CUData& cu = interMode.cu;//获取当前CU(Coding Unit)的数据结构Yuv* predYuv = &interMode.predYuv;//获取当前CU的预测YUV图像// 12 mv candidates including lowresMVMV mvc[(MD_ABOVE_LEFT + 1) * 2 + 2];const Slice *slice = m_slice;int numPart     = cu.getNumPartInter(0);//获取CU中的PU的数量int numPredDir  = slice->isInterP() ? 1 : 2;const int* numRefIdx = slice->m_numRefIdx;uint32_t lastMode = 0;int      totalmebits = 0;MV       mvzero(0, 0);Yuv&     tmpPredYuv = m_rqt[cuGeom.depth].tmpPredYuv;MergeData merge;memset(&merge, 0, sizeof(merge));bool useAsMVP = false;for (int puIdx = 0; puIdx < numPart; puIdx++){MotionData* bestME = interMode.bestME[puIdx];PredictionUnit pu(cu, cuGeom, puIdx);//设置运动估计的源图像m_me.setSourcePU(*interMode.fencYuv, pu.ctuAddr, pu.cuAbsPartIdx, pu.puAbsPartIdx, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine, bChromaMC);useAsMVP = false;x265_analysis_inter_data* interDataCTU = NULL;int cuIdx;cuIdx = (interMode.cu.m_cuAddr * m_param->num4x4Partitions) + cuGeom.absPartIdx;if (m_param->analysisLoadReuseLevel == 10 && m_param->interRefine > 1){   //如果当前CU的预测模式、分区大小等与先前分析数据匹配,则将其作为运动矢量预测的候选interDataCTU = m_frame->m_analysisData.interData;if ((cu.m_predMode[pu.puAbsPartIdx] == interDataCTU->modes[cuIdx + pu.puAbsPartIdx])&& (cu.m_partSize[pu.puAbsPartIdx] == interDataCTU->partSize[cuIdx + pu.puAbsPartIdx])&& !(interDataCTU->mergeFlag[cuIdx + puIdx])&& (cu.m_cuDepth[0] == interDataCTU->depth[cuIdx]))useAsMVP = true;}/* find best cost merge candidate. note: 2Nx2N merge and bidir are handled as separate modes */uint32_t mrgCost = numPart == 1 ? MAX_UINT : mergeEstimation(cu, cuGeom, pu, puIdx, merge);bestME[0].cost = MAX_UINT;//将最佳运动估计的成本初始化为最大值bestME[1].cost = MAX_UINT;//获取块的比特数getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits);bool bDoUnidir = true;//用于标记是否进行单向预测//获取邻居块的运动矢量cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);/* Uni-directional prediction *///对于单向预测,默认配置一般不进入该分支if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10)|| (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP)){for (int list = 0; list < numPredDir; list++){//获取参考帧索引int ref = -1;if (useAsMVP)ref = interDataCTU->refIdx[list][cuIdx + puIdx];elseref = bestME[list].ref;if (ref < 0){continue;}//计算比特数uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;bits += getTUBits(ref, numRefIdx[list]);//选择最佳运动矢量预测(MVP)并获取MVP索引int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);const MV* amvp = interMode.amvpCand[list][ref];int mvpIdx = selectMVP(cu, pu, amvp, list, ref);MV mvmin, mvmax, outmv, mvp;if (useAsMVP){mvp = interDataCTU->mv[list][cuIdx + puIdx].word;mvpIdx = interDataCTU->mvpIdx[list][cuIdx + puIdx];}elsemvp = amvp[mvpIdx];if (m_param->searchMethod == X265_SEA){int puX = puIdx & 1;int puY = puIdx >> 1;for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;}//设置搜索范围setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);MV mvpIn = mvp;int satdCost;if (m_param->analysisMultiPassRefine && m_param->rc.bStatRead && mvpIdx == bestME[list].mvpIdx)mvpIn = bestME[list].mv;if (useAsMVP && m_param->mvRefine > 1){MV bestmv, mvpSel[3];int mvpIdxSel[3];satdCost = m_me.COST_MAX;mvpSel[0] = 
mvp;mvpIdxSel[0] = mvpIdx;mvpIdx = selectMVP(cu, pu, amvp, list, ref);mvpSel[1] = interMode.amvpCand[list][ref][mvpIdx];mvpIdxSel[1] = mvpIdx;if (m_param->mvRefine > 2){mvpSel[2] = interMode.amvpCand[list][ref][!mvpIdx];mvpIdxSel[2] = !mvpIdx;}for (int cand = 0; cand < m_param->mvRefine; cand++){if (cand && (mvpSel[cand] == mvpSel[cand - 1] || (cand == 2 && mvpSel[cand] == mvpSel[cand - 2])))continue;//设置搜索范围setSearchRange(cu, mvpSel[cand], m_param->searchRange, mvmin, mvmax);int bcost = m_me.motionEstimate(&m_slice->m_mref[list][ref], mvmin, mvmax, mvpSel[cand], numMvc, mvc, m_param->searchRange, bestmv, m_param->maxSlices,m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);if (satdCost > bcost){satdCost = bcost;outmv = bestmv;mvp = mvpSel[cand];mvpIdx = mvpIdxSel[cand];}}mvpIn = mvp;}else{satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvpIn, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices,m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);}/* Get total cost of partition, but only include MV bit cost once */bits += m_me.bitcost(outmv);uint32_t mvCost = m_me.mvcost(outmv);uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);/* Refine MVP selection, updates: mvpIdx, bits, cost */if (!(m_param->analysisMultiPassRefine || useAsMVP))mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);else{/* It is more accurate to compare with actual mvp that was used in motionestimate than amvp[mvpIdx]. Here the actual mvp is bestME from pass 1 for that mvpIdx */int diffBits = m_me.bitcost(outmv, amvp[!mvpIdx]) - m_me.bitcost(outmv, mvpIn);if (diffBits < 0){mvpIdx = !mvpIdx;uint32_t origOutBits = bits;bits = origOutBits + diffBits;cost = (cost - m_rdCost.getCost(origOutBits)) + m_rdCost.getCost(bits);}mvp = amvp[mvpIdx];}if (cost < bestME[list].cost){bestME[list].mv = outmv;bestME[list].mvp = mvp;bestME[list].mvpIdx = mvpIdx;bestME[list].cost = cost;bestME[list].bits = bits;bestME[list].mvCost  = mvCost;bestME[list].ref = ref;}bDoUnidir = false;}            }else if (m_param->bDistributeMotionEstimation){PME pme(*this, interMode, cuGeom, pu, puIdx);pme.m_jobTotal = 0;pme.m_jobAcquired = 1; /* reserve L0-0 or L1-0 */uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1;for (int list = 0; list < numPredDir; list++){int idx = 0;for (int ref = 0; ref < numRefIdx[list]; ref++){if (!(refMask & (1 << ref)))continue;pme.m_jobs.ref[list][idx++]  = ref;pme.m_jobTotal++;}pme.m_jobs.refCnt[list] = idx;/* the second list ref bits start at bit 16 */refMask >>= 16;}if (pme.m_jobTotal > 2){pme.tryBondPeers(*m_frame->m_encData->m_jobProvider, pme.m_jobTotal - 1);processPME(pme, *this);int ref = pme.m_jobs.refCnt[0] ? pme.m_jobs.ref[0][0] : pme.m_jobs.ref[1][0];singleMotionEstimation(*this, interMode, pu, puIdx, 0, ref); /* L0-0 or L1-0 */bDoUnidir = false;ProfileCUScopeNamed(pmeWaitScope, interMode.cu, pmeBlockTime, countPMEMasters);pme.waitForExit();}/* if no peer threads were bonded, fall back to doing unidirectional* searches ourselves without overhead of singleMotionEstimation() */}if (bDoUnidir)//单向运动估计{interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1;uint32_t refMask = refMasks[puIdx] ? 
refMasks[puIdx] : (uint32_t)-1;for (int list = 0; list < numPredDir; list++){for (int ref = 0; ref < numRefIdx[list]; ref++){ProfileCounter(interMode.cu, totalMotionReferences[cuGeom.depth]);//根据参考掩码refMask和当前参考图像的索引ref,判断是否需要跳过当前参考图像的估计过程。如果需要跳过,则继续下一个参考图像的估计if (!(refMask & (1 << ref))){ProfileCounter(interMode.cu, skippedMotionReferences[cuGeom.depth]);continue;}//接着,计算一些比特数,并根据参考图像、预测方向和其他参数获取运动矢量候选列表uint32_t bits = m_listSelBits[list] + MVP_IDX_BITS;bits += getTUBits(ref, numRefIdx[list]);int numMvc = cu.getPMV(interMode.interNeighbours, list, ref, interMode.amvpCand[list][ref], mvc);//选择一个最佳的运动矢量预测(MVP)作为初始的MVP,并进行一些相关计算和操作const MV* amvp = interMode.amvpCand[list][ref];int mvpIdx = selectMVP(cu, pu, amvp, list, ref);MV mvmin, mvmax, outmv, mvp = amvp[mvpIdx], mvp_lowres;bool bLowresMVP = false;if (!m_param->analysisSave && !m_param->analysisLoad) /* Prevents load/save outputs from diverging when lowresMV is not available */{MV lmv = getLowresMV(cu, pu, list, ref);if (lmv.notZero())mvc[numMvc++] = lmv;if (m_param->bEnableHME)mvp_lowres = lmv;}if (m_param->searchMethod == X265_SEA){int puX = puIdx & 1;int puY = puIdx >> 1;for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)m_me.integral[planes] = interMode.fencYuv->m_integral[list][ref][planes] + puX * pu.width + puY * pu.height * m_slice->m_refFrameList[list][ref]->m_reconPic->m_stride;}//根据搜索范围和其他参数,使用运动估计算法估计当前参考图像的最佳运动矢量,并计算其相关的代价setSearchRange(cu, mvp, m_param->searchRange, mvmin, mvmax);int satdCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp, numMvc, mvc, m_param->searchRange, outmv, m_param->maxSlices, m_param->bSourceReferenceEstimation ? m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);//如果允许低分辨率的MVP,并且低分辨率的MVP的代价小于当前参考图像的代价,则选择低分辨率的MVP,并更新相关的代价和比特数if (m_param->bEnableHME && mvp_lowres.notZero() && mvp_lowres != mvp){MV outmv_lowres;setSearchRange(cu, mvp_lowres, m_param->searchRange, mvmin, mvmax);int lowresMvCost = m_me.motionEstimate(&slice->m_mref[list][ref], mvmin, mvmax, mvp_lowres, numMvc, mvc, m_param->searchRange, outmv_lowres, m_param->maxSlices,m_param->bSourceReferenceEstimation ? 
m_slice->m_refFrameList[list][ref]->m_fencPic->getLumaAddr(0) : 0);if (lowresMvCost < satdCost){outmv = outmv_lowres;satdCost = lowresMvCost;bLowresMVP = true;}}//根据运动矢量的比特数和代价,计算总的代价/* Get total cost of partition, but only include MV bit cost once */bits += m_me.bitcost(outmv);uint32_t mvCost = m_me.mvcost(outmv);uint32_t cost = (satdCost - mvCost) + m_rdCost.getCost(bits);/* Update LowresMVP to best AMVP cand 更新最佳运动矢量预测(MVP)以及相关的代价和比特数*/if (bLowresMVP)updateMVP(amvp[mvpIdx], outmv, bits, cost, mvp_lowres);/* Refine MVP selection, updates: mvpIdx, bits, cost */mvp = checkBestMVP(amvp, outmv, mvpIdx, bits, cost);//如果当前参考图像的代价小于当前预测方向的最佳代价,则更新最佳运动矢量和相关信息if (cost < bestME[list].cost){bestME[list].mv      = outmv;bestME[list].mvp     = mvp;bestME[list].mvpIdx  = mvpIdx;bestME[list].ref     = ref;bestME[list].cost    = cost;bestME[list].bits    = bits;bestME[list].mvCost  = mvCost;}}/* the second list ref bits start at bit 16 */refMask >>= 16;}}/* Bi-directional prediction */MotionData bidir[2];uint32_t bidirCost = MAX_UINT;int bidirBits = 0;//它检查是否适用于双向预测的情况,包括当前CU是否为Inter-B模式,是否允许使用双向预测,以及当前PU的分割大小是否为2Nx2N(2Nx2N的双向预测在其他地方处理)if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction is possible for this PU */cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N biprediction is handled elsewhere */bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT){   //代码将获取两个最佳的单向预测结果(bestME[0]和bestME[1]),并将它们存储在bidir[0]和bidir[1]中bidir[0] = bestME[0];bidir[1] = bestME[1];int satdCost;if (m_me.bChromaSATD){cu.m_mv[0][pu.puAbsPartIdx] = bidir[0].mv;cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;cu.m_mv[1][pu.puAbsPartIdx] = bidir[1].mv;cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;motionCompensation(cu, pu, tmpPredYuv, true, true);satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);}else{PicYuv* refPic0 = slice->m_refReconPicList[0][bestME[0].ref];PicYuv* refPic1 = slice->m_refReconPicList[1][bestME[1].ref];Yuv* bidirYuv = m_rqt[cuGeom.depth].bidirPredYuv;/* Generate reference subpels */predInterLumaPixel(pu, bidirYuv[0], *refPic0, bestME[0].mv);predInterLumaPixel(pu, bidirYuv[1], *refPic1, bestME[1].mv);primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (bidirYuv[0].m_size % 64 == 0) && (bidirYuv[1].m_size % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, bidirYuv[0].getLumaAddr(pu.puAbsPartIdx), bidirYuv[0].m_size,bidirYuv[1].getLumaAddr(pu.puAbsPartIdx), bidirYuv[1].m_size, 32);satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);}//计算双向预测所需的比特数,并将其存储在bidirBits中。然后,将代价与比特数结合使用rdCost.getCost函数计算总代价,并将其存储在bidirCost中bidirBits = bestME[0].bits + bestME[1].bits + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);bidirCost = satdCost + m_rdCost.getCost(bidirBits);//代码接着检查是否尝试使用零运动矢量进行预测。如果bestME[0]和bestME[1]的运动矢量不为零,则尝试使用零运动矢量进行预测bool bTryZero = bestME[0].mv.notZero() || bestME[1].mv.notZero();if (bTryZero){   //在这种情况下,代码会进行进一步的计算和比较,包括计算零运动矢量的代价、比特数,并进行MVP(运动矢量预测)的选择和更新/* Do not try zero MV if unidir motion predictors are beyond* valid search area */MV mvmin, mvmax;int merange = X265_MAX(m_param->sourceWidth, m_param->sourceHeight);setSearchRange(cu, mvzero, merange, mvmin, mvmax);mvmax.y += 2; // there is some pad for subpel refinemvmin <<= 2;mvmax <<= 2;bTryZero &= bestME[0].mvp.checkRange(mvmin, mvmax);bTryZero &= bestME[1].mvp.checkRange(mvmin, mvmax);}if (bTryZero){/* coincident blocks of the two reference pictures */if 
(m_me.bChromaSATD){cu.m_mv[0][pu.puAbsPartIdx] = mvzero;cu.m_refIdx[0][pu.puAbsPartIdx] = (int8_t)bidir[0].ref;cu.m_mv[1][pu.puAbsPartIdx] = mvzero;cu.m_refIdx[1][pu.puAbsPartIdx] = (int8_t)bidir[1].ref;motionCompensation(cu, pu, tmpPredYuv, true, true);satdCost = m_me.bufSATD(tmpPredYuv.getLumaAddr(pu.puAbsPartIdx), tmpPredYuv.m_size) +m_me.bufChromaSATD(tmpPredYuv, pu.puAbsPartIdx);}else{const pixel* ref0 = m_slice->m_mref[0][bestME[0].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);const pixel* ref1 = m_slice->m_mref[1][bestME[1].ref].getLumaAddr(pu.ctuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx);intptr_t refStride = slice->m_mref[0][0].lumaStride;primitives.pu[m_me.partEnum].pixelavg_pp[(tmpPredYuv.m_size % 64 == 0) && (refStride % 64 == 0)](tmpPredYuv.m_buf[0], tmpPredYuv.m_size, ref0, refStride, ref1, refStride, 32);satdCost = m_me.bufSATD(tmpPredYuv.m_buf[0], tmpPredYuv.m_size);}MV mvp0 = bestME[0].mvp;int mvpIdx0 = bestME[0].mvpIdx;uint32_t bits0 = bestME[0].bits - m_me.bitcost(bestME[0].mv, mvp0) + m_me.bitcost(mvzero, mvp0);MV mvp1 = bestME[1].mvp;int mvpIdx1 = bestME[1].mvpIdx;uint32_t bits1 = bestME[1].bits - m_me.bitcost(bestME[1].mv, mvp1) + m_me.bitcost(mvzero, mvp1);uint32_t cost = satdCost + m_rdCost.getCost(bits0) + m_rdCost.getCost(bits1);/* refine MVP selection for zero mv, updates: mvp, mvpidx, bits, cost */mvp0 = checkBestMVP(interMode.amvpCand[0][bestME[0].ref], mvzero, mvpIdx0, bits0, cost);mvp1 = checkBestMVP(interMode.amvpCand[1][bestME[1].ref], mvzero, mvpIdx1, bits1, cost);//代码比较使用零运动矢量和双向预测运动矢量的代价,并选择代价较小的作为最终的双向预测结果。最终的双向预测结果存储在bidir数组中,而相关的比特数和代价存储在bidirBits和bidirCost中if (cost < bidirCost){bidir[0].mv = mvzero;bidir[1].mv = mvzero;bidir[0].mvp = mvp0;bidir[1].mvp = mvp1;bidir[0].mvpIdx = mvpIdx0;bidir[1].mvpIdx = mvpIdx1;bidirCost = cost;bidirBits = bits0 + bits1 + m_listSelBits[2] - (m_listSelBits[0] + m_listSelBits[1]);}}}//这段代码的目的是选择最佳的预测模式,并将其存储在CU(Coding Unit,编码单元)中/* select best option and store into CU */if (mrgCost < bidirCost && mrgCost < bestME[0].cost && mrgCost < bestME[1].cost){//比较了合并模式(merge mode)的代价(mrgCost)与双向预测(bidirCost)和单向预测(bestME[0].cost和bestME[1].cost)的代价。如果合并模式的代价最小,那么选择合并模式作为最佳模式cu.m_mergeFlag[pu.puAbsPartIdx] = true;cu.m_mvpIdx[0][pu.puAbsPartIdx] = merge.index; /* merge candidate ID is stored in L0 MVP idx */cu.setPUInterDir(merge.dir, pu.puAbsPartIdx, puIdx);cu.setPUMv(0, merge.mvField[0].mv, pu.puAbsPartIdx, puIdx);cu.setPURefIdx(0, merge.mvField[0].refIdx, pu.puAbsPartIdx, puIdx);cu.setPUMv(1, merge.mvField[1].mv, pu.puAbsPartIdx, puIdx);cu.setPURefIdx(1, merge.mvField[1].refIdx, pu.puAbsPartIdx, puIdx);totalmebits += merge.bits;}//如果双向预测的代价最小else if (bidirCost < bestME[0].cost && bidirCost < bestME[1].cost){lastMode = 2;cu.m_mergeFlag[pu.puAbsPartIdx] = false;cu.setPUInterDir(3, pu.puAbsPartIdx, puIdx);cu.setPUMv(0, bidir[0].mv, pu.puAbsPartIdx, puIdx);cu.setPURefIdx(0, bestME[0].ref, pu.puAbsPartIdx, puIdx);cu.m_mvd[0][pu.puAbsPartIdx] = bidir[0].mv - bidir[0].mvp;cu.m_mvpIdx[0][pu.puAbsPartIdx] = bidir[0].mvpIdx;cu.setPUMv(1, bidir[1].mv, pu.puAbsPartIdx, puIdx);cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);cu.m_mvd[1][pu.puAbsPartIdx] = bidir[1].mv - bidir[1].mvp;cu.m_mvpIdx[1][pu.puAbsPartIdx] = bidir[1].mvpIdx;totalmebits += bidirBits;}//如果双向预测的代价不是最小的,而单向预测的代价最小,则分别选择单向预测方向0和1作为最佳模式else if (bestME[0].cost <= bestME[1].cost){lastMode = 0;cu.m_mergeFlag[pu.puAbsPartIdx] = false;cu.setPUInterDir(1, pu.puAbsPartIdx, puIdx);cu.setPUMv(0, bestME[0].mv, pu.puAbsPartIdx, puIdx);cu.setPURefIdx(0, 
bestME[0].ref, pu.puAbsPartIdx, puIdx);cu.m_mvd[0][pu.puAbsPartIdx] = bestME[0].mv - bestME[0].mvp;cu.m_mvpIdx[0][pu.puAbsPartIdx] = bestME[0].mvpIdx;cu.setPURefIdx(1, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);cu.setPUMv(1, mvzero, pu.puAbsPartIdx, puIdx);totalmebits += bestME[0].bits;}else{lastMode = 1;cu.m_mergeFlag[pu.puAbsPartIdx] = false;cu.setPUInterDir(2, pu.puAbsPartIdx, puIdx);cu.setPUMv(1, bestME[1].mv, pu.puAbsPartIdx, puIdx);cu.setPURefIdx(1, bestME[1].ref, pu.puAbsPartIdx, puIdx);cu.m_mvd[1][pu.puAbsPartIdx] = bestME[1].mv - bestME[1].mvp;cu.m_mvpIdx[1][pu.puAbsPartIdx] = bestME[1].mvpIdx;cu.setPURefIdx(0, REF_NOT_VALID, pu.puAbsPartIdx, puIdx);cu.setPUMv(0, mvzero, pu.puAbsPartIdx, puIdx);totalmebits += bestME[1].bits;}motionCompensation(cu, pu, *predYuv, true, bChromaMC);}//最后一行代码将总的运动估计比特数(totalmebits)累加到interMode.sa8dBits中interMode.sa8dBits += totalmebits;
}
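To make the overall structure of this long function easier to follow, here is a simplified sketch of the final per-PU mode decision. It keeps only the cost-comparison skeleton and omits writing back MVs, reference indices, MVDs and MVP indices; the enum and function name are illustrative:

#include <cstdint>

enum class PuMode { Merge, Bidir, UniL0, UniL1 };

PuMode selectBestPuMode(uint32_t mrgCost, uint32_t bidirCost,
                        uint32_t costL0, uint32_t costL1)
{
    if (mrgCost < bidirCost && mrgCost < costL0 && mrgCost < costL1)
        return PuMode::Merge;               // merge candidate is cheapest
    if (bidirCost < costL0 && bidirCost < costL1)
        return PuMode::Bidir;               // bi-directional prediction is cheapest
    return costL0 <= costL1 ? PuMode::UniL0 // otherwise pick the cheaper uni-directional list
                            : PuMode::UniL1;
}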

10. Motion estimation MotionEstimate::motionEstimate

Starting from the best AMVP candidate MV, this function first runs an integer-pel motion search (DIA, HEX, UMH, STAR, SEA or full search), then refines the result at half-pel precision and finally at quarter-pel precision. The corresponding code analysis is as follows:

int MotionEstimate::motionEstimate(ReferencePlanes *ref,const MV &       mvmin,const MV &       mvmax,const MV &       qmvp,int              numCandidates,const MV *       mvc,int              merange,MV &             outQMv,uint32_t         maxSlices,pixel *          srcReferencePlane)
{   //根据一些参数和条件进行一些初始化操作,包括确定是否使用低分辨率参考图像、计算图像偏移、获取参考图像和当前图像的指针等ALIGN_VAR_16(int, costs[16]);bool hme = srcReferencePlane && srcReferencePlane == ref->fpelLowerResPlane[0];if (ctuAddr >= 0)blockOffset = ref->reconPic->getLumaAddr(ctuAddr, absPartIdx) - ref->reconPic->getLumaAddr(0);intptr_t stride = hme ? ref->lumaStride / 2 : ref->lumaStride;pixel* fenc = fencPUYuv.m_buf[0];pixel* fref = srcReferencePlane == 0 ? ref->fpelPlane[0] + blockOffset : srcReferencePlane + blockOffset;//设置初始的运动矢量预测(MVP)为给定的qmvpsetMVP(qmvp);//放大4倍,转换为1/4像素的向量MV qmvmin = mvmin.toQPel();MV qmvmax = mvmax.toQPel();/* The term cost used here means satd/sad values for that particular search.* The costs used in ME integer search only includes the SAD cost of motion* residual and sqrtLambda times MVD bits.  The subpel refine steps use SATD* cost of residual and sqrtLambda * MVD bits.  Mode decision will be based* on video distortion cost (SSE/PSNR) plus lambda times all signaling bits* (mode + MVD bits). */// measure SAD cost at clipped QPEL MVP 根据给定的mvmin和mvmax对qmvp进行裁剪,得到pmvMV pmv = qmvp.clipped(qmvmin, qmvmax);MV bestpre = pmv;int bprecost;//并计算pmv该运动矢量的初始代价bprecostif (ref->isLowres)//如果参考图像为低分辨率图像,则使用低分辨率QPel代价函数进行计算,否则使用亚像素比较函数进行计算bprecost = ref->lowresQPelCost(fenc, blockOffset, pmv, sad, hme);elsebprecost = subpelCompare(ref, pmv, sad);/* re-measure full pel rounded MVP with SAD as search start point */MV bmv = pmv.roundToFPel();int bcost = bprecost;if (pmv.isSubpel())bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) + mvcost(bmv << 2);//如果pmv不为零,则测量运动矢量为零时的代价,并与初始代价进行比较,选择代价较小的作为最佳运动矢量// measure SAD cost at MV(0) if MVP is not zeroif (pmv.notZero()){int cost = sad(fenc, FENC_STRIDE, fref, stride) + mvcost(MV(0, 0));if (cost < bcost){bcost = cost;bmv = 0;bmv.y = X265_MAX(X265_MIN(0, mvmax.y), mvmin.y);}}X265_CHECK(!(ref->isLowres && numCandidates), "lowres motion candidates not allowed\n")// measure SAD cost at each QPEL motion vector candidatefor (int i = 0; i < numCandidates; i++)//对于每个给定的运动矢量候选项mvc{   //将候选项裁剪到mvmin和mvmax的范围内MV m = mvc[i].clipped(qmvmin, qmvmax);if (m.notZero() & (m != pmv ? 1 : 0) & (m != bestpre ? 1 : 0)) // check already measured{//如果候选项不为零且不等于pmv和bestpre(避免重复测量),则使用亚像素比较函数计算该候选项的代价,并与当前最佳代价进行比较,更新最佳运动矢量和代价int cost = subpelCompare(ref, m, sad) + mvcost(m);if (cost < bprecost){bprecost = cost;bestpre = m;}}}//将pmv取整到QPel,并将bmv初始化为pmvpmv = pmv.roundToFPel();MV omv = bmv;  // current search origin or starting pointint search = ref->isHMELowres ? (hme ? searchMethodL0 : searchMethodL1) : searchMethod;switch (search){case X265_DIA_SEARCH://这里使用的是钻石搜索方法(DIAMOND_SEARCH){/* diamond search, radius 1 */bcost <<= 4;//设置初始的搜索代价为bmv的代价的16倍int i = merange;do{   //进行一系列搜索步骤,每步选择当前位置周围的四个方向进行搜索,并更新最佳代价和最佳运动矢COST_MV_X4_DIR(0, -1, 0, 1, -1, 0, 1, 0, costs);if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))COPY1_IF_LT(bcost, (costs[0] << 4) + 1);if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))COPY1_IF_LT(bcost, (costs[1] << 4) + 3);COPY1_IF_LT(bcost, (costs[2] << 4) + 4);COPY1_IF_LT(bcost, (costs[3] << 4) + 12);if (!(bcost & 15))break;bmv.x -= (bcost << 28) >> 30;bmv.y -= (bcost << 30) >> 30;bcost &= ~15;}while (--i && bmv.checkRange(mvmin, mvmax));//搜索步骤的次数由merange确定,且在满足条件的情况下进行搜索bcost >>= 4;//最终将搜索代价右移4位,得到最终的搜索代价break;}case X265_HEX_SEARCH:{
me_hex2:/* hexagon search, radius 2 */
#if 0for (int i = 0; i < merange / 2; i++){omv = bmv;COST_MV(omv.x - 2, omv.y);COST_MV(omv.x - 1, omv.y + 2);COST_MV(omv.x + 1, omv.y + 2);COST_MV(omv.x + 2, omv.y);COST_MV(omv.x + 1, omv.y - 2);COST_MV(omv.x - 1, omv.y - 2);if (omv == bmv)break;if (!bmv.checkRange(mvmin, mvmax))break;}#else // if 0 用于执行六边形和方形搜索过程/* equivalent to the above, but eliminates duplicate candidates */COST_MV_X3_DIR(-2, 0, -1, 2,  1, 2, costs);bcost <<= 3;//根据当前的运动矢量bmv和一些预定义的偏移量进行搜索。搜索过程按照特定的顺序,依次计算候选运动矢量的代价,并与当前最佳代价进行比较。如果候选运动矢量在允许的范围内且代价较小,则更新最佳运动矢量和代价。搜索过程中,根据当前的方向选择相应的偏移量,并在每次搜索后更新方向和运动矢量if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))COPY1_IF_LT(bcost, (costs[0] << 3) + 2);if ((bmv.y + 2 >= mvmin.y) & (bmv.y + 2 <= mvmax.y)){COPY1_IF_LT(bcost, (costs[1] << 3) + 3);COPY1_IF_LT(bcost, (costs[2] << 3) + 4);}COST_MV_X3_DIR(2, 0,  1, -2, -1, -2, costs);if ((bmv.y >= mvmin.y) & (bmv.y <= mvmax.y))COPY1_IF_LT(bcost, (costs[0] << 3) + 5);if ((bmv.y - 2 >= mvmin.y) & (bmv.y - 2 <= mvmax.y)){COPY1_IF_LT(bcost, (costs[1] << 3) + 6);COPY1_IF_LT(bcost, (costs[2] << 3) + 7);}if (bcost & 7){int dir = (bcost & 7) - 2;if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y)){bmv += hex2[dir + 1];//如果检测到最优点不是当前点,则按照半六边形方式往最优点方向遍历/* half hexagon, not overlapping the previous iteration */for (int i = (merange >> 1) - 1; i > 0 && bmv.checkRange(mvmin, mvmax); i--){COST_MV_X3_DIR(hex2[dir + 0].x, hex2[dir + 0].y,hex2[dir + 1].x, hex2[dir + 1].y,hex2[dir + 2].x, hex2[dir + 2].y,costs);bcost &= ~7;if ((bmv.y + hex2[dir + 0].y >= mvmin.y) & (bmv.y + hex2[dir + 0].y <= mvmax.y))COPY1_IF_LT(bcost, (costs[0] << 3) + 1);if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))COPY1_IF_LT(bcost, (costs[1] << 3) + 2);if ((bmv.y + hex2[dir + 2].y >= mvmin.y) & (bmv.y + hex2[dir + 2].y <= mvmax.y))COPY1_IF_LT(bcost, (costs[2] << 3) + 3);if (!(bcost & 7))break;dir += (bcost & 7) - 2;dir = mod6m1[dir + 1];bmv += hex2[dir + 1];}} // if ((bmv.y + hex2[dir + 1].y >= mvmin.y) & (bmv.y + hex2[dir + 1].y <= mvmax.y))}bcost >>= 3;
#endif // if 0//方形搜索将当前最佳运动矢量作为中心,按照固定的偏移量进行搜索。类似于六边形搜索,方形搜索也计算候选运动矢量的代价,并与当前最佳代价进行比较。如果候选运动矢量在允许的范围内且代价较小,则更新最佳运动矢量和代价/* square refine */int dir = 0;COST_MV_X4_DIR(0, -1,  0, 1, -1, 0, 1, 0, costs);if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))COPY2_IF_LT(bcost, costs[0], dir, 1);if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))COPY2_IF_LT(bcost, costs[1], dir, 2);COPY2_IF_LT(bcost, costs[2], dir, 3);COPY2_IF_LT(bcost, costs[3], dir, 4);COST_MV_X4_DIR(-1, -1, -1, 1, 1, -1, 1, 1, costs);if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))COPY2_IF_LT(bcost, costs[0], dir, 5);if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))COPY2_IF_LT(bcost, costs[1], dir, 6);if ((bmv.y - 1 >= mvmin.y) & (bmv.y - 1 <= mvmax.y))COPY2_IF_LT(bcost, costs[2], dir, 7);if ((bmv.y + 1 >= mvmin.y) & (bmv.y + 1 <= mvmax.y))COPY2_IF_LT(bcost, costs[3], dir, 8);bmv += square1[dir];break;}case X265_UMH_SEARCH:{int ucost1, ucost2;int16_t cross_start = 1;/* refine predictors */omv = bmv;ucost1 = bcost;X265_CHECK(((pmv.y >= mvmin.y) & (pmv.y <= mvmax.y)), "pmv outside of search range!");DIA1_ITER(pmv.x, pmv.y);if (pmv.notZero())DIA1_ITER(0, 0);ucost2 = bcost;if (bmv.notZero() && bmv != pmv)DIA1_ITER(bmv.x, bmv.y);if (bcost == ucost2)cross_start = 3;/* Early Termination */omv = bmv;if (bcost == ucost2 && SAD_THRESH(2000)){COST_MV_X4(0, -2, -1, -1, 1, -1, -2, 0);COST_MV_X4(2, 0, -1, 1, 1, 1,  0, 2);if (bcost == ucost1 && SAD_THRESH(500))break;if (bcost == ucost2){int16_t range = (int16_t)(merange >> 1) | 1;CROSS(3, range, range);COST_MV_X4(-1, -2, 1, -2, -2, -1, 2, -1);COST_MV_X4(-2, 1, 2, 1, -1, 2, 1, 2);if (bcost == ucost2)break;cross_start = range + 2;}}// TODO: Need to study x264's logic for building mvc list to understand why they//       have special cases here for 16x16, and whether they apply to HEVC CTU// adaptive search range based on mvc variabilityif (numCandidates){/* range multipliers based on casual inspection of some statistics of* average distance between current predictor and final mv found by ESA.* these have not been tuned much by actual encoding. */static const uint8_t range_mul[4][4] ={{ 3, 3, 4, 4 },{ 3, 4, 4, 4 },{ 4, 4, 4, 5 },{ 4, 4, 5, 6 },};int mvd;int sad_ctx, mvd_ctx;int denom = 1;if (numCandidates == 1){if (LUMA_64x64 == partEnum)/* mvc is probably the same as mvp, so the difference isn't meaningful.* but prediction usually isn't too bad, so just use medium range */mvd = 25;elsemvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);}else{/* calculate the degree of agreement between predictors. *//* in 64x64, mvc includes all the neighbors used to make mvp,* so don't count mvp separately. */denom = numCandidates - 1;mvd = 0;if (partEnum != LUMA_64x64){mvd = abs(qmvp.x - mvc[0].x) + abs(qmvp.y - mvc[0].y);denom++;}mvd += predictorDifference(mvc, numCandidates);}sad_ctx = SAD_THRESH(1000) ? 0: SAD_THRESH(2000) ? 1: SAD_THRESH(4000) ? 2 : 3;mvd_ctx = mvd < 10 * denom ? 0: mvd < 20 * denom ? 1: mvd < 40 * denom ? 2 : 3;merange = (merange * range_mul[mvd_ctx][sad_ctx]) >> 2;}/* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.* we are still centered on the same place as the DIA2. is this desirable? 
*/CROSS(cross_start, merange, merange >> 1);COST_MV_X4(-2, -2, -2, 2, 2, -2, 2, 2);/* hexagon grid */omv = bmv;const uint16_t *p_cost_omvx = m_cost_mvx + omv.x * 4;const uint16_t *p_cost_omvy = m_cost_mvy + omv.y * 4;uint16_t i = 1;do{if (4 * i > X265_MIN4(mvmax.x - omv.x, omv.x - mvmin.x,mvmax.y - omv.y, omv.y - mvmin.y)){for (int j = 0; j < 16; j++){MV mv = omv + (hex4[j] * i);if (mv.checkRange(mvmin, mvmax))COST_MV(mv.x, mv.y);}}else{int16_t dir = 0;pixel *fref_base = fref + omv.x + (omv.y - 4 * i) * stride;size_t dy = (size_t)i * stride;
#define SADS(k, x0, y0, x1, y1, x2, y2, x3, y3) \sad_x4(fenc, \fref_base x0 * i + (y0 - 2 * k + 4) * dy, \fref_base x1 * i + (y1 - 2 * k + 4) * dy, \fref_base x2 * i + (y2 - 2 * k + 4) * dy, \fref_base x3 * i + (y3 - 2 * k + 4) * dy, \stride, costs + 4 * k); \fref_base += 2 * dy;
#define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + p_cost_omvy[y * 4 * i]
#define MIN_MV(k, dx, dy)     if ((omv.y + (dy) >= mvmin.y) & (omv.y + (dy) <= mvmax.y)) { COPY2_IF_LT(bcost, costs[k], dir, dx * 16 + (dy & 15)) }SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);SADS(2, -4, +0, +4, +0, -4, +1, +4, +1);SADS(3, -4, +2, +4, +2, -2, +3, +2, +3);ADD_MVCOST(0, 0, -4);ADD_MVCOST(1, 0, 4);ADD_MVCOST(2, -2, -3);ADD_MVCOST(3, 2, -3);ADD_MVCOST(4, -4, -2);ADD_MVCOST(5, 4, -2);ADD_MVCOST(6, -4, -1);ADD_MVCOST(7, 4, -1);ADD_MVCOST(8, -4, 0);ADD_MVCOST(9, 4, 0);ADD_MVCOST(10, -4, 1);ADD_MVCOST(11, 4, 1);ADD_MVCOST(12, -4, 2);ADD_MVCOST(13, 4, 2);ADD_MVCOST(14, -2, 3);ADD_MVCOST(15, 2, 3);MIN_MV(0, 0, -4);MIN_MV(1, 0, 4);MIN_MV(2, -2, -3);MIN_MV(3, 2, -3);MIN_MV(4, -4, -2);MIN_MV(5, 4, -2);MIN_MV(6, -4, -1);MIN_MV(7, 4, -1);MIN_MV(8, -4, 0);MIN_MV(9, 4, 0);MIN_MV(10, -4, 1);MIN_MV(11, 4, 1);MIN_MV(12, -4, 2);MIN_MV(13, 4, 2);MIN_MV(14, -2, 3);MIN_MV(15, 2, 3);
#undef SADS
#undef ADD_MVCOST
#undef MIN_MVif (dir){bmv.x = omv.x + i * (dir >> 4);bmv.y = omv.y + i * ((dir << 28) >> 28);}}}while (++i <= merange >> 2);if (bmv.checkRange(mvmin, mvmax))goto me_hex2;break;}case X265_STAR_SEARCH: // Adapted from HM ME{int bPointNr = 0;int bDistance = 0;const int EarlyExitIters = 3;StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, EarlyExitIters, merange, hme);if (bDistance == 1){// if best distance was only 1, check two missing points.  If no new point is found, stopif (bPointNr){/* For a given direction 1 to 8, check nearest two outer X pixelsX   XX 1 2 3 X4 * 5X 6 7 8 XX   X*/int saved = bcost;const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];if (mv1.checkRange(mvmin, mvmax)){COST_MV(mv1.x, mv1.y);}if (mv2.checkRange(mvmin, mvmax)){COST_MV(mv2.x, mv2.y);}if (bcost == saved)break;}elsebreak;}const int RasterDistance = 5;if (bDistance > RasterDistance){// raster search refinement if original search distance was too bigMV tmv;for (tmv.y = mvmin.y; tmv.y <= mvmax.y; tmv.y += RasterDistance){for (tmv.x = mvmin.x; tmv.x <= mvmax.x; tmv.x += RasterDistance){if (tmv.x + (RasterDistance * 3) <= mvmax.x){pixel *pix_base = fref + tmv.y * stride + tmv.x;sad_x4(fenc,pix_base,pix_base + RasterDistance,pix_base + RasterDistance * 2,pix_base + RasterDistance * 3,stride, costs);costs[0] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[0], bmv, tmv);tmv.x += RasterDistance;costs[1] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[1], bmv, tmv);tmv.x += RasterDistance;costs[2] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[2], bmv, tmv);tmv.x += RasterDistance;costs[3] += mvcost(tmv << 3);COPY2_IF_LT(bcost, costs[3], bmv, tmv);}elseCOST_MV(tmv.x, tmv.y);}}}while (bDistance > 0){// center a new search around current bestbDistance = 0;bPointNr = 0;const int MaxIters = 32;StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, MaxIters, merange, hme);if (bDistance == 1){if (!bPointNr)break;/* For a given direction 1 to 8, check nearest 2 outer X pixelsX   XX 1 2 3 X4 * 5X 6 7 8 XX   X*/const MV mv1 = bmv + offsets[(bPointNr - 1) * 2];const MV mv2 = bmv + offsets[(bPointNr - 1) * 2 + 1];if (mv1.checkRange(mvmin, mvmax)){COST_MV(mv1.x, mv1.y);}if (mv2.checkRange(mvmin, mvmax)){COST_MV(mv2.x, mv2.y);}break;}}break;}case X265_SEA:{// Successive Elimination Algorithmconst int32_t minX = X265_MAX(omv.x - (int32_t)merange, mvmin.x);const int32_t minY = X265_MAX(omv.y - (int32_t)merange, mvmin.y);const int32_t maxX = X265_MIN(omv.x + (int32_t)merange, mvmax.x);const int32_t maxY = X265_MIN(omv.y + (int32_t)merange, mvmax.y);const uint16_t *p_cost_mvx = m_cost_mvx - qmvp.x;const uint16_t *p_cost_mvy = m_cost_mvy - qmvp.y;int16_t* meScratchBuffer = NULL;int scratchSize = merange * 2 + 4;if (scratchSize){meScratchBuffer = X265_MALLOC(int16_t, scratchSize);memset(meScratchBuffer, 0, sizeof(int16_t)* scratchSize);}/* SEA is fastest in multiples of 4 */int meRangeWidth = (maxX - minX + 3) & ~3;int w = 0, h = 0;                    // Width and height of the PUALIGN_VAR_32(pixel, zero[64 * FENC_STRIDE]) = { 0 };ALIGN_VAR_32(int, encDC[4]);uint16_t *fpelCostMvX = m_fpelMvCosts[-qmvp.x & 3] + (-qmvp.x >> 2);sizesFromPartition(partEnum, &w, &h);int deltaX = (w <= 8) ? (w) : (w >> 1);int deltaY = (h <= 8) ? 
(h) : (h >> 1);/* Check if very small rectangular blocks which cannot be sub-divided anymore */bool smallRectPartition = partEnum == LUMA_4x4 || partEnum == LUMA_16x12 ||partEnum == LUMA_12x16 || partEnum == LUMA_16x4 || partEnum == LUMA_4x16;/* Check if vertical partition */bool verticalRect = partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||partEnum == LUMA_4x8;/* Check if horizontal partition */bool horizontalRect = partEnum == LUMA_64x32 || partEnum == LUMA_32x16 || partEnum == LUMA_16x8 ||partEnum == LUMA_8x4;/* Check if assymetric vertical partition */bool assymetricVertical = partEnum == LUMA_12x16 || partEnum == LUMA_4x16 || partEnum == LUMA_24x32 ||partEnum == LUMA_8x32 || partEnum == LUMA_48x64 || partEnum == LUMA_16x64;/* Check if assymetric horizontal partition */bool assymetricHorizontal = partEnum == LUMA_16x12 || partEnum == LUMA_16x4 || partEnum == LUMA_32x24 ||partEnum == LUMA_32x8 || partEnum == LUMA_64x48 || partEnum == LUMA_64x16;int tempPartEnum = 0;/* If a vertical rectangular partition, it is horizontally split into two, for ads_x2() */if (verticalRect)tempPartEnum = partitionFromSizes(w, h >> 1);/* If a horizontal rectangular partition, it is vertically split into two, for ads_x2() */else if (horizontalRect)tempPartEnum = partitionFromSizes(w >> 1, h);/* We have integral planes introduced to account for assymetric partitions.* Hence all assymetric partitions except those which cannot be split into legal sizes,* are split into four for ads_x4() */else if (assymetricVertical || assymetricHorizontal)tempPartEnum = smallRectPartition ? partEnum : partitionFromSizes(w >> 1, h >> 1);/* General case: Square partitions. All partitions with width > 8 are split into four* for ads_x4(), for 4x4 and 8x8 we do ads_x1() */elsetempPartEnum = (w <= 8) ? partEnum : partitionFromSizes(w >> 1, h >> 1);/* Successive elimination by comparing DC before a full SAD,* because sum(abs(diff)) >= abs(diff(sum)). 
*/primitives.pu[tempPartEnum].sad_x4(zero,fenc,fenc + deltaX,fenc + deltaY * FENC_STRIDE,fenc + deltaX + deltaY * FENC_STRIDE,FENC_STRIDE,encDC);/* Assigning appropriate integral plane */uint32_t *sumsBase = NULL;switch (deltaX){case 32: if (deltaY % 24 == 0)sumsBase = integral[1];else if (deltaY == 8)sumsBase = integral[2];elsesumsBase = integral[0];break;case 24: sumsBase = integral[3];break;case 16: if (deltaY % 12 == 0)sumsBase = integral[5];else if (deltaY == 4)sumsBase = integral[6];elsesumsBase = integral[4];break;case 12: sumsBase = integral[7];break;case 8: if (deltaY == 32)sumsBase = integral[8];elsesumsBase = integral[9];break;case 4: if (deltaY == 16)sumsBase = integral[10];elsesumsBase = integral[11];break;default: sumsBase = integral[11];break;}if (partEnum == LUMA_64x64 || partEnum == LUMA_32x32 || partEnum == LUMA_16x16 ||partEnum == LUMA_32x64 || partEnum == LUMA_16x32 || partEnum == LUMA_8x16 ||partEnum == LUMA_4x8 || partEnum == LUMA_12x16 || partEnum == LUMA_4x16 ||partEnum == LUMA_24x32 || partEnum == LUMA_8x32 || partEnum == LUMA_48x64 ||partEnum == LUMA_16x64)deltaY *= (int)stride;if (verticalRect)encDC[1] = encDC[2];if (horizontalRect)deltaY = deltaX;/* ADS and SAD */MV tmv;for (tmv.y = minY; tmv.y <= maxY; tmv.y++){int i, xn;int ycost = p_cost_mvy[tmv.y] << 2;if (bcost <= ycost)continue;bcost -= ycost;/* ADS_4 for 16x16, 32x32, 64x64, 24x32, 32x24, 48x64, 64x48, 32x8, 8x32, 64x16, 16x64 partitions* ADS_1 for 4x4, 8x8, 16x4, 4x16, 16x12, 12x16 partitions* ADS_2 for all other rectangular partitions */xn = ads(encDC,sumsBase + minX + tmv.y * stride,deltaY,fpelCostMvX + minX,meScratchBuffer,meRangeWidth,bcost);for (i = 0; i < xn - 2; i += 3)COST_MV_X3_ABS(minX + meScratchBuffer[i], tmv.y,minX + meScratchBuffer[i + 1], tmv.y,minX + meScratchBuffer[i + 2], tmv.y);bcost += ycost;for (; i < xn; i++)COST_MV(minX + meScratchBuffer[i], tmv.y);}if (meScratchBuffer)x265_free(meScratchBuffer);break;}case X265_FULL_SEARCH:{// dead slow exhaustive search, but at least it uses sad_x4()MV tmv;int32_t mvmin_y = mvmin.y, mvmin_x = mvmin.x, mvmax_y = mvmax.y, mvmax_x = mvmax.x;if (ref->isHMELowres){merange = (merange < 0 ? 
-merange : merange);mvmin_y = X265_MAX(mvmin.y, -merange);mvmin_x = X265_MAX(mvmin.x, -merange);mvmax_y = X265_MIN(mvmax.y, merange);mvmax_x = X265_MIN(mvmax.x, merange);}for (tmv.y = mvmin_y; tmv.y <= mvmax_y; tmv.y++){for (tmv.x = mvmin_x; tmv.x <= mvmax_x; tmv.x++){if (tmv.x + 3 <= mvmax_x){pixel *pix_base = fref + tmv.y * stride + tmv.x;sad_x4(fenc,pix_base,pix_base + 1,pix_base + 2,pix_base + 3,stride, costs);costs[0] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[0], bmv, tmv);tmv.x++;costs[1] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[1], bmv, tmv);tmv.x++;costs[2] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[2], bmv, tmv);tmv.x++;costs[3] += mvcost(tmv << 2);COPY2_IF_LT(bcost, costs[3], bmv, tmv);}elseCOST_MV(tmv.x, tmv.y);}}break;}default:X265_CHECK(0, "invalid motion estimate mode\n");break;}//如果之前的预测代价(bprecost)小于当前的代价(bcost),则将之前的预测运动矢量(bestpre)作为最佳运动矢量,并更新最佳代价(bcost)为预测代价(bprecost)if (bprecost < bcost){bmv = bestpre;bcost = bprecost;}else//否则,将当前的运动矢量(bmv)提升为四分之一像素(qpel)精度。这是为了进一步细化运动矢量的精度,以获得更准确的代价估计。bmv = bmv.toQPel(); // promote search bmv to qpelconst SubpelWorkload& wl = workload[this->subpelRefine];//根据最大切片数(maxSlices)和运动矢量的y分量(bmv.y)与允许范围的比较,检查运动矢量是否超出了切片的边界。如果超出了范围,则将运动矢量限制在范围内,并重新计算代价。这是为了确保运动矢量在切片边界内,以避免跨切片的运动估计// check mv range for slice boundif ((maxSlices > 1) & ((bmv.y < qmvmin.y) | (bmv.y > qmvmax.y))){bmv.y = x265_min(x265_max(bmv.y, qmvmin.y), qmvmax.y);bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);}//如果最终的代价(bcost)为零,表示在剪裁后的预测运动矢量处没有残差。在这种情况下,可以跳过亚像素细化过程,但仍需考虑运动矢量的代价(mvcost)。if (!bcost){/* if there was zero residual at the clipped MVP, we can skip subpel* refine, but we do need to include the mvcost in the returned cost */bcost = mvcost(bmv);}else if (ref->isLowres){   //如果参考图像为低分辨率(isLowres为真),则进行低分辨率亚像素细化过程。int bdir = 0;for (int i = 1; i <= wl.hpel_dirs; i++){   //首先,通过对当前运动矢量(bmv)进行四分之一像素偏移(square1[i] * 2)来获取候选运动矢量(qmv)MV qmv = bmv + square1[i] * 2;//然后,检查候选运动矢量是否在允许范围内,如果不在范围内则跳过。计算候选运动矢量的代价(cost),并与当前最佳代价(bcost)进行比较,更新最佳运动矢量和代价/* skip invalid range */if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))continue;//根据选择的最佳方向(bdir),更新当前运动矢量(bmv)和代价(bcost)int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad, hme) + mvcost(qmv);COPY2_IF_LT(bcost, cost, bdir, i);}//通过对当前运动矢量(bmv)进行二分之一像素偏移(square1[bdir])来更新运动矢量bmv += square1[bdir] * 2;bcost = ref->lowresQPelCost(fenc, blockOffset, bmv, satd, hme) + mvcost(bmv);bdir = 0;for (int i = 1; i <= wl.qpel_dirs; i++){MV qmv = bmv + square1[i];/* skip invalid range */if ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))continue;int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, satd, hme) + mvcost(qmv);COPY2_IF_LT(bcost, cost, bdir, i);}bmv += square1[bdir];}else{   //对之前的运动矢量进行亚像素级别的优化搜索pixelcmp_t hpelcomp;//根据亚像素级别的(wl.hpel_satd)进行判断if (wl.hpel_satd){bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);hpelcomp = satd;}elsehpelcomp = sad;for (int iter = 0; iter < wl.hpel_iters; iter++){int bdir = 0;for (int i = 1; i <= wl.hpel_dirs; i++){   //在每次迭代中,对当前运动矢量(bmv)进行二分之一像素偏移(square1[i] * 2)来获取候选运动矢量(qmv)MV qmv = bmv + square1[i] * 2;//检查候选运动矢量是否在允许范围内,如果超出了范围则跳过// check mv range for slice boundif ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))continue;//与当前最佳代价进行比较。更新最佳运动矢量和代价的方式与之前的代码类似int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv);COPY2_IF_LT(bcost, cost, bdir, i);}if (bdir)bmv += square1[bdir] * 2;elsebreak;}//亚像素级别的优化搜索分为两个阶段:HPEL(二分之一像素)和QPEL(四分之一像素)/* if HPEL search used SAD, remeasure with SATD before QPEL */if (!wl.hpel_satd)bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv);for (int iter = 0; iter < 
wl.qpel_iters; iter++){   //接着进行QPEL搜索,迭代进行,直到没有更好的方向为止int bdir = 0;for (int i = 1; i <= wl.qpel_dirs; i++){MV qmv = bmv + square1[i];// check mv range for slice boundif ((qmv.y < qmvmin.y) | (qmv.y > qmvmax.y))continue;int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv);COPY2_IF_LT(bcost, cost, bdir, i);}if (bdir)bmv += square1[bdir];elsebreak;}}// check mv range for slice boundX265_CHECK(((bmv.y >= qmvmin.y) & (bmv.y <= qmvmax.y)), "mv beyond range!");x265_emms();outQMv = bmv;return bcost;
}
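After the integer-pel search, the code refines the MV around the eight square1 neighbours, first at half-pel and then at quarter-pel precision. The sketch below captures only that two-stage refinement skeleton; the cost callback stands in for subpelCompare plus mvcost, the iteration counts and the number of directions are assumptions, and the slice-bound checks and low-resolution branch are omitted:

#include <functional>

struct MV2 { int x, y; };

// Eight neighbours plus the centre, analogous to x265's square1 table (order is illustrative)
static const MV2 kSquare[9] = { {0,0}, {0,-1}, {0,1}, {-1,0}, {1,0},
                                {-1,-1}, {-1,1}, {1,-1}, {1,1} };

MV2 subpelRefine(MV2 bmv, int& bcost, int hpelIters, int qpelIters,
                 const std::function<int(MV2)>& cost)
{
    for (int step = 2; step >= 1; step >>= 1)        // step 2: half-pel offsets, step 1: quarter-pel offsets (in QPel units)
    {
        int iters = (step == 2) ? hpelIters : qpelIters;
        for (int iter = 0; iter < iters; iter++)
        {
            int bdir = 0;
            for (int i = 1; i <= 8; i++)             // probe the eight neighbouring directions
            {
                MV2 qmv = { bmv.x + kSquare[i].x * step, bmv.y + kSquare[i].y * step };
                int c = cost(qmv);
                if (c < bcost) { bcost = c; bdir = i; }
            }
            if (!bdir)                               // no direction improved the cost: stop this stage
                break;
            bmv.x += kSquare[bdir].x * step;
            bmv.y += kSquare[bdir].y * step;
        }
    }
    return bmv;
}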

Likes and bookmarks are what keep me writing. A gift of roses leaves its fragrance in the hand.


