OpenGL.Shader:志哥教你寫一個濾鏡直播用戶端(11)
一、優化的必要性
為何要再談高斯濾波?因為高斯濾波在實際中應用廣泛,而且可能會用到比較大的卷積核(譬如說[5x5] [7x7] [9x9],注意都是奇數大小)。此時如果再使用之前介紹的簡單高斯濾波實作,gpu顯存會增大,此時程式性能就不太令人滿意了。文章背景就是如此,那如何優化?利用卷積可分離性就是解決這種問題的一種思路。配合OpenGL.Shader的多着色器結合便能有效地把優化得以實作。
接下來先簡單介紹什麼是卷積可分離性。 假設A為列向量,B為行向量,則有A*B=B*A。例如n=3時,如下圖所示:
根據以上理論,前面說到的高斯核Kernel2D可以理解為KernelX·KernelY,那麼卷積過程就可以表示為:Dst=Src*Kernel2D=(Src*KernelX)*KernelY=(Src*KernelY)*KernelX。一般來說,無論對於何種卷積濾波,只要其卷積核可以拆解為兩個行向量與列向量的乘積,那麼就算卷積可分離。 此過程也可以理解成,把二維卷積核降到一維進行處理。
二、利用FBO實作卷積核降維
道理大家都懂了,那麼該如何實作其降維優化的邏輯呢?我們可以利用FBO離屏渲染,先處理Src*KernelX的邏輯部分,把結果儲存到FBO1,再以FBO1為輸入與KernelY再次運算得出最終輸出。GPUImage當中也是通過這一思路實作其他複雜的卷積優化,和一些濾鏡合併的操作,其仿照核心實作類GpuBaseFilterGroup.hpp,代碼如下:
#ifndef GPU_FILTER_GROUP_HPP
#define GPU_FILTER_GROUP_HPP
#include <list>
#include <vector>
#include "GpuBaseFilter.hpp"
class GpuBaseFilterGroup : public GpuBaseFilter {
// GpuBaseFilter virtual method
public:
GpuBaseFilterGroup()
{
mFBO_IDs = NULL;
mFBO_TextureIDs = NULL;
}
virtual void onOutputSizeChanged(int width, int height) {
if (mFilterList.empty()) return;
destroyFrameBufferObjs();
std::vector<GpuBaseFilter>::iterator itr;
for(itr=mFilterList.begin(); itr!=mFilterList.end(); itr++)
{
GpuBaseFilter filter = *itr;
filter.onOutputSizeChanged(width, height);
}
createFrameBufferObjs(width, height);
}
virtual void destroy() {
destroyFrameBufferObjs();
std::vector<GpuBaseFilter>::iterator itr;
for(itr=mFilterList.begin(); itr!=mFilterList.end(); itr++)
{
GpuBaseFilter filter = *itr;
filter.destroy();
}
mFilterList.clear();
GpuBaseFilter::destroy();
}
private:
void createFrameBufferObjs(int _width, int _height ) {
const int num = mFilterList.size() -1;
// 最後一次draw是在顯示螢幕上
mFBO_IDs = new GLuint[num];
mFBO_TextureIDs = new GLuint[num];
glGenFramebuffers(num, mFBO_IDs);
glGenTextures(num, mFBO_TextureIDs);
for (int i = 0; i < num; i++) {
glBindTexture(GL_TEXTURE_2D, mFBO_TextureIDs[i]);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); // GL_REPEAT
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); // GL_REPEAT
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA16F, _width, _height, 0, GL_RGBA, GL_FLOAT, 0);
glBindFramebuffer(GL_FRAMEBUFFER, mFBO_IDs[i]);
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, mFBO_TextureIDs[i], 0);
glBindTexture(GL_TEXTURE_2D, 0);
glBindFramebuffer(GL_FRAMEBUFFER, 0);
}
}
void destroyFrameBufferObjs() {
if (mFBO_TextureIDs != NULL) {
glDeleteTextures(length(mFBO_TextureIDs), mFBO_TextureIDs);
delete[] mFBO_TextureIDs;
mFBO_TextureIDs = NULL;
}
if (mFBO_IDs != NULL) {
glDeleteFramebuffers(length(mFBO_IDs), mFBO_IDs);
delete[] mFBO_IDs;
mFBO_IDs = NULL;
}
}
inline int length(GLuint arr[]) {
return sizeof(arr) / sizeof(arr[0]);
}
public:
std::vector<GpuBaseFilter> mFilterList;
void addFilter(GpuBaseFilter filter) {
mFilterList.push_back(filter);
}
GLuint* mFBO_IDs;
GLuint* mFBO_TextureIDs;
};
#endif // GPU_FILTER_GROUP_HPP
代碼條理清晰,内容易懂,雖然它是繼承GpuBaseFilter,其實是為了相容之前父類引用的做法。然後再覆寫這個基礎方法。關鍵是在onOutputSizeChanged當中,我們單獨拿來說說:
// Resizes the whole filter chain: tears down cached FBOs, forwards the new
// size to every child filter, then rebuilds the FBOs at that size.
virtual void onOutputSizeChanged(int width, int height) {
    if (mFilterList.empty()) return;
    // Nothing to do when the filter list is empty.
    destroyFrameBufferObjs();
    // Destroy the previously cached FBOs and their attached textures.
    std::vector<GpuBaseFilter>::iterator itr;
    for (itr = mFilterList.begin(); itr != mFilterList.end(); ++itr)
    {
        // Fix: call through the iterator. Copying the element into a local
        // (as the old code did) applied the size change to a temporary
        // object and silently discarded it.
        itr->onOutputSizeChanged(width, height);
    }
    // Re-create the FBOs and their bound textures for the new size.
    createFrameBufferObjs(width, height);
}
接着是私有方法createFrameBufferObjs,具體代碼參照上方,注意一點的是,建立的數量是mFilterList.size() -1,因為最後一次的最終輸出圖像是渲染到螢幕上。
三、GpuGaussianBlurFilter2
最後一步,也是最複雜的一步,改造我們的高斯濾鏡的實作。先看着色器代碼
// Vertex shader for one pass of a separable 9-tap Gaussian blur.
// Emits 9 texture coordinates centered on the current texel, offset along
// the axis selected by (widthFactor, heightFactor).
attribute vec4 position;
attribute vec4 inputTextureCoordinate;
// Number of taps sampled per fragment.
const int GAUSSIAN_SAMPLES = 9;
// Step per tap. The host sets exactly one of these non-zero so a single
// pass blurs along one axis only (horizontal OR vertical); setting both
// would tilt the sample line diagonally.
uniform float widthFactor;
uniform float heightFactor;
varying vec2 blurCoordinates[GAUSSIAN_SAMPLES];
void main()
{
gl_Position = position;
vec2 singleStepOffset = vec2(widthFactor, heightFactor);
int multiplier = 0;
vec2 blurStep;
for (int i = 0; i < GAUSSIAN_SAMPLES; i++)
{
// Tap index relative to the center sample:
multiplier = (i - ((GAUSSIAN_SAMPLES - 1) / 2));
//-4,-3,-2,-1,0,1,2,3,4
blurStep = float(multiplier) * singleStepOffset;
blurCoordinates[i] = inputTextureCoordinate.xy + blurStep;
}
}
以目前頂點為中心,輸出9個采樣點的坐标位置, 其中初讀代碼singleStepOffset可能會有疑問,以往我們都把寬度因子和高度因子都一并傳入,這樣9個采樣點的位置就是成45°對角傾斜。這裡先不做解釋,下面會有詳細解讀。
// Fragment shader: weights the 9 pre-computed sample coordinates with a
// normalized Gaussian kernel (0.05+0.09+0.12+0.15+0.18+0.15+0.12+0.09+0.05 = 1.0).
// Supports two input modes: planar YUV textures or a single RGB texture.
uniform sampler2D SamplerY;
uniform sampler2D SamplerU;
uniform sampler2D SamplerV;
uniform sampler2D SamplerRGB;
// YUV -> RGB conversion matrix (column-major), used by yuv2rgb().
mat3 colorConversionMatrix = mat3(
1.0, 1.0, 1.0,
0.0, -0.39465, 2.03211,
1.13983, -0.58060, 0.0);
// Samples the three Y/U/V planes at pos and converts the result to RGB.
vec3 yuv2rgb(vec2 pos)
{
vec3 yuv;
yuv.x = texture2D(SamplerY, pos).r;
yuv.y = texture2D(SamplerU, pos).r - 0.5; // remove chroma bias
yuv.z = texture2D(SamplerV, pos).r - 0.5;
return colorConversionMatrix * yuv;
}
uniform int drawMode; // 0 = YUV planes, 1 = RGB texture
const int GAUSSIAN_SAMPLES = 9;
varying vec2 blurCoordinates[GAUSSIAN_SAMPLES];
void main()
{
vec3 fragmentColor = vec3(0.0);
if (drawMode==0)
{
// YUV mode: convert each tap to RGB, then accumulate weighted taps.
fragmentColor += (yuv2rgb(blurCoordinates[0]) *0.05);
fragmentColor += (yuv2rgb(blurCoordinates[1]) *0.09);
fragmentColor += (yuv2rgb(blurCoordinates[2]) *0.12);
fragmentColor += (yuv2rgb(blurCoordinates[3]) *0.15);
fragmentColor += (yuv2rgb(blurCoordinates[4]) *0.18);
fragmentColor += (yuv2rgb(blurCoordinates[5]) *0.15);
fragmentColor += (yuv2rgb(blurCoordinates[6]) *0.12);
fragmentColor += (yuv2rgb(blurCoordinates[7]) *0.09);
fragmentColor += (yuv2rgb(blurCoordinates[8]) *0.05);
gl_FragColor = vec4(fragmentColor, 1.0);
}
else
{
// RGB mode: sample the single RGB texture with the same weights.
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[0]).rgb *0.05);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[1]).rgb *0.09);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[2]).rgb *0.12);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[3]).rgb *0.15);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[4]).rgb *0.18);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[5]).rgb *0.15);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[6]).rgb *0.12);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[7]).rgb *0.09);
fragmentColor += (texture2D(SamplerRGB, blurCoordinates[8]).rgb *0.05);
gl_FragColor = vec4(fragmentColor, 1.0);
}
}
看着很複雜,又yuv又rgb的,其實最初GpuBaseFilter的設計就是相容兩種模式,之前是我偷懶沒寫全。這裡寫全了實際的内容很簡單,根據9個坐標點進行紋理色值的采樣,然後進行卷積運算。高斯核也省了,簡化為了9個權值系數,值得注意的是這9個系數不是亂定義的,是根據通用的高斯公式生成,而且都已經歸一化,9個系數相加總數是等於一!
那麼為啥我這裡不能偷懶隻寫一種drawMode,頂點着色器的singleStepOffset造成頂點坐标的45°傾斜又如何了解?接着走下去。
// Two-pass Gaussian blur built on GpuBaseFilterGroup. Both passes share the
// same vertex/fragment shader pair; only the uniform values differ per pass.
// (Article excerpt — "..." marks code elided by the author.)
class GpuGaussianBlurFilter2 : public GpuBaseFilterGroup {
GpuGaussianBlurFilter2()
{
GAUSSIAN_BLUR_VERTEX_SHADER = "...";
GAUSSIAN_BLUR_FRAGMENT_SHADER = "..."; // shader sources shown above
}
~GpuGaussianBlurFilter2()
{
if(!GAUSSIAN_BLUR_VERTEX_SHADER.empty()) GAUSSIAN_BLUR_VERTEX_SHADER.clear();
if(!GAUSSIAN_BLUR_FRAGMENT_SHADER.empty()) GAUSSIAN_BLUR_FRAGMENT_SHADER.clear();
}
// Builds two filter programs from the same shader sources and caches each
// program's uniform locations separately (the two programs are distinct
// GL objects, so their locations must not be mixed up).
void init() {
GpuBaseFilter filter1;
filter1.init(GAUSSIAN_BLUR_VERTEX_SHADER.c_str(), GAUSSIAN_BLUR_FRAGMENT_SHADER.c_str());
mWidthFactorLocation1 = glGetUniformLocation(filter1.getProgram(), "widthFactor");
mHeightFactorLocation1 = glGetUniformLocation(filter1.getProgram(), "heightFactor");
mDrawModeLocation1 = glGetUniformLocation(filter1.getProgram(), "drawMode");
addFilter(filter1);
GpuBaseFilter filter2;
filter2.init(GAUSSIAN_BLUR_VERTEX_SHADER.c_str(), GAUSSIAN_BLUR_FRAGMENT_SHADER.c_str());
mWidthFactorLocation2 = glGetUniformLocation(filter2.getProgram(), "widthFactor");
mHeightFactorLocation2 = glGetUniformLocation(filter2.getProgram(), "heightFactor");
mDrawModeLocation2 = glGetUniformLocation(filter2.getProgram(), "drawMode");
addFilter(filter2);
}
... ...
}
再看覆寫父類(GpuBaseFilterGroup)的父類(GpuBaseFilter)的無參的方法init(),友善統一管理和代碼引用。内容不難,就是建立兩個着色器對象,都是用了同一套着色器,但是Shader的對象引用注意區分。
// Article excerpt (continuation): size-change and blur-strength entry points.
class GpuGaussianBlurFilter2 : public GpuBaseFilterGroup {
... ... 接上
public:
// Delegates to the group implementation: destroys and re-creates the FBOs
// and resizes every child filter (see section 2 above).
void onOutputSizeChanged(int width, int height) {
GpuBaseFilterGroup::onOutputSizeChanged(width, height);
}
// Converts a UI percentage into the per-tap sample offset used by both
// blur passes. NOTE(review): range() is declared elsewhere — presumably a
// GPUImage-style helper mapping the 0..100 value into [0.0, 2.0]; confirm.
void setAdjustEffect(float percent) {
mSampleOffset = range(percent * 100.0f, 0.0f, 2.0f);
}
}
接着還是覆寫父類(GpuBaseFilterGroup)的父類(GpuBaseFilter)的無參的方法onOutputSizeChanged,不需要做特殊的處理,直接使用父類GpuBaseFilterGroup的代碼邏輯(内容見上方目錄二)
// Article excerpt (continuation): the chained render entry point.
class GpuGaussianBlurFilter2 : public GpuBaseFilterGroup {
... ... 接上
public:
// Runs the filter chain: pass i==0 renders the YUV input into FBO[0]
// (horizontal blur); pass i==1 reads FBO[0]'s texture and draws the
// vertical blur straight to the default framebuffer (the screen).
void onDraw(GLuint SamplerY_texId, GLuint SamplerU_texId, GLuint SamplerV_texId,
void* positionCords, void* textureCords)
{
if (mFilterList.size()==0) return;
GLuint previousTexture = 0; // output texture of the previous pass
int size = mFilterList.size();
for (int i = 0; i < size; i++) {
GpuBaseFilter filter = mFilterList[i];
bool isNotLast = i < size - 1;
if (isNotLast) {
// Intermediate pass: render offscreen into this pass's FBO.
glBindFramebuffer(GL_FRAMEBUFFER, mFBO_IDs[i]);
}
// NOTE(review): only the clear color is set here; glClear() is never
// called in this excerpt — confirm the buffer is cleared elsewhere.
glClearColor(0, 0, 0, 0);
if (i == 0) {
drawFilter1YUV(filter, SamplerY_texId, SamplerU_texId, SamplerV_texId, positionCords, textureCords);
}
if (i == 1) { //isNotLast=false, not bind FBO, draw on screen.
drawFilter2RGB(filter, previousTexture, positionCords, mNormalTextureCords);
}
if (isNotLast) {
// Unbind the FBO and remember the texture holding this pass's output.
glBindFramebuffer(GL_FRAMEBUFFER, 0);
previousTexture = mFBO_TextureIDs[i];
}
}
}
}
到重點渲染方法onDraw,覆寫爺爺類GpuBaseFilter,是所有濾鏡接口的通用方法。代碼邏輯參照GPUImage進行簡化。額,感覺沒啥好說,因為這個onDraw方法是GpuGaussianBlurFilter2的具體實作,並不具有通用性,按照(目錄一)高斯濾鏡的優化實作邏輯就可以了。
i==0時,先進行src*kernelX的離屏渲染。進入具體檢視drawFilter1YUV的内容。
// Article excerpt (continuation): the first (horizontal) blur pass.
class GpuGaussianBlurFilter2 : public GpuBaseFilterGroup {
... ... 接上
private:
// Pass 1: samples the planar YUV source textures and blurs horizontally.
// All generic attribute/uniform handles come from the passed-in filter;
// only the three pass-specific uniform locations (mode/width/height) are
// the ones cached for program 1 in init().
void drawFilter1YUV(GpuBaseFilter filter,
GLuint SamplerY_texId, GLuint SamplerU_texId, GLuint SamplerV_texId,
void* positionCords, void* textureCords)
{
if (!filter.isInitialized())
return;
glUseProgram(filter.getProgram());
glUniform1i(mDrawModeLocation1, 0); // drawMode 0 = sample YUV planes
//glUniform1f(mSampleOffsetLocation1, mSampleOffset);
// Horizontal-only step: widthFactor set, heightFactor zero, so the 9 taps
// spread along the x axis with no 45-degree diagonal drift.
glUniform1f(mWidthFactorLocation1, mSampleOffset / filter.mOutputWidth);
glUniform1f(mHeightFactorLocation1, 0);
glVertexAttribPointer(filter.mGLAttribPosition, 2, GL_FLOAT, GL_FALSE, 0, positionCords);
glEnableVertexAttribArray(filter.mGLAttribPosition);
glVertexAttribPointer(filter.mGLAttribTextureCoordinate, 2, GL_FLOAT, GL_FALSE, 0, textureCords);
glEnableVertexAttribArray(filter.mGLAttribTextureCoordinate);
// Bind the three YUV planes to texture units 0..2.
glActiveTexture(GL_TEXTURE0);
glBindTexture(GL_TEXTURE_2D, SamplerY_texId);
glUniform1i(filter.mGLUniformSampleY, 0);
glActiveTexture(GL_TEXTURE1);
glBindTexture(GL_TEXTURE_2D, SamplerU_texId);
glUniform1i(filter.mGLUniformSampleU, 1);
glActiveTexture(GL_TEXTURE2);
glBindTexture(GL_TEXTURE_2D, SamplerV_texId);
glUniform1i(filter.mGLUniformSampleV, 2);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
glDisableVertexAttribArray(filter.mGLAttribPosition);
glDisableVertexAttribArray(filter.mGLAttribTextureCoordinate);
glBindTexture(GL_TEXTURE_2D, 0);
}
}
注意看着哦,所有的通用Shader應用對象索引,都是指定傳入的filter,只有三個特殊對象索引特殊處理。i==0時,先進行src*kernelX的離屏渲染。第一次我們是從視訊原生資料yuv進行圖像采樣,是以要使用drawMode=0,即YUV模式。然後,widthFactor傳入 SampleOffset / 螢幕寬度 作為頂點着色器的寬度因子。但是!!!heightFactor傳入0!即目前縱向不進行偏移,是以頂點着色器就不會出現45°階梯式的采樣偏移了,此時完成了 src*kernelX的離屏渲染。
趁熱打鐵,當i==1,這一次是循環的最後一次,不再需要離屏渲染,是直接輸出到螢幕。注意previousTexture緩存了i==0離屏渲染的所綁定的紋理id,其承載着i==0即src*kernelX的渲染結果。我們以此為輸入,進行drawFilter2RGB
// Article excerpt (continuation): the second (vertical) blur pass.
class GpuGaussianBlurFilter2 : public GpuBaseFilterGroup {
... ... 接上
private:
// Pass 2: reads the horizontally-blurred RGB texture produced by pass 1
// (_texId) and blurs vertically, drawing to the currently bound target
// (the screen, since onDraw leaves the last pass unbound).
void drawFilter2RGB(GpuBaseFilter filter, GLuint _texId, void* positionCords, void* textureCords)
{
if (!filter.isInitialized())
return;
glUseProgram(filter.getProgram());
glUniform1i(mDrawModeLocation2, 1); // drawMode 1 = sample the RGB texture
//glUniform1f(mSampleOffsetLocation2, mSampleOffset);
// Vertical-only step: widthFactor zero, heightFactor set, completing
// (src*kernelX)*kernelY of the separable convolution.
glUniform1f(mWidthFactorLocation2, 0);
glUniform1f(mHeightFactorLocation2, mSampleOffset / filter.mOutputHeight);
glVertexAttribPointer(filter.mGLAttribPosition, 2, GL_FLOAT, GL_FALSE, 0, positionCords);
glEnableVertexAttribArray(filter.mGLAttribPosition);
glVertexAttribPointer(filter.mGLAttribTextureCoordinate, 2, GL_FLOAT, GL_FALSE, 0, textureCords);
glEnableVertexAttribArray(filter.mGLAttribTextureCoordinate);
// Intermediate result goes on texture unit 3, matching SamplerRGB.
glActiveTexture(GL_TEXTURE3);
glBindTexture(GL_TEXTURE_2D, _texId);
glUniform1i(filter.mGLUniformSampleRGB, 3);
glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
glDisableVertexAttribArray(filter.mGLAttribPosition);
glDisableVertexAttribArray(filter.mGLAttribTextureCoordinate);
glBindTexture(GL_TEXTURE_2D, 0);
}
}
還是那三個特殊處理的shader引用索引。此時傳入的是rgb紋理,drawMode使用rgb模式1,然後這一次widthFactor為0,heightFactor傳入mSampleOffset / 螢幕高度,完成最後一步的 (src*kernelX)*kernelY。
四、總結
最終測試可以在GpuFilterRender::checkFilterChange,把GpuGaussianBlurFilter的引用替換成GpuGaussianBlurFilter2。大家可以對比效果可以發現2的實作比1要更明顯,那是因為GpuGaussianBlurFilter隻是一個簡單的3x3高斯核,GpuGaussianBlurFilter2是9x9的運算結果。雖然看着2比1的運算量要大,但是再看看GPU的顯存情況,降低了差不多一半,性能得到明顯改善。
此篇不單隻是get到卷積核的降維優化實作,還get到了多shader的分級渲染方法,這是不是可以考慮考慮多濾鏡的組合效果實作呢 ?代碼同步至:https://github.com/MrZhaozhirong/NativeCppApp /src/main/cpp/gpufilter/filter/GpuGaussianBlurFilter2.hpp