Silicon & Lithium: 2014

Sunday, 11 May 2014

MSVC C99 math.h header.

These days with Visual Studio 2013 (msvc12) being out, Microsoft has a proper C99 compliant math.h C header file. For anyone using any C99 math function I would first recommend you do so using msvc12. However recently I was working on a project that wanted to support older versions of msvc so I wrote up some compatibility code that added the additional missing functions that were added to math.h in C99. Since older msvc versions are C89 compliant they are missing many functions that were added in C99. However some of these functions aren't actually missing they were just added to the header using a different name (often with an '_' prefix). So if you know where these functions are you can make them usable in a C99 way.

So here is some code that can be added under a normal math.h include to add missing C99 functions. Not all of them are here and those that are missing are identified with a simple comment. But many of the commonly used ones are provided so hopefully this may be useful for someone.

#if defined(_MSC_VER) && (_MSC_VER < 1800)
// MSVC 11 (Visual Studio 2012) or earlier does not define a C99 compliant
// math.h header; Visual Studio 2013 (msvc12, _MSC_VER == 1800) and later do,
// so the shims below are only compiled for the older compilers.
// Missing functions are included here for compatibility.
// float.h declares the underscore-prefixed CRT helpers used below
// (_copysign, _fpclass, _finite, _isnan, _logb, _nextafter).
#include <float.h>
// Inverse hyperbolic cosine, evaluated as ln(x + sqrt(x^2 - 1)).
static __inline double acosh(double x){
    const double root = sqrt((x * x) - 1.0);
    return log(x + root);
}
// Single precision variant of acosh.
static __inline float acoshf(float x){
    const float root = sqrtf((x * x) - 1.0f);
    return logf(x + root);
}
// acoshl simply forwards to the double implementation.
#   define acoshl(x) acosh(x)
// Inverse hyperbolic sine: asinh(x) = sign(x) * ln(|x| + sqrt(x^2 + 1)).
// The logarithm is evaluated on |x| and the sign restored afterwards.
// Evaluating x + sqrt(x*x + 1) directly for negative x subtracts two nearly
// equal values, losing precision and collapsing to log(0) = -infinity for
// large negative inputs.
static __inline double asinh(double x){
    double ax = fabs(x);
    double r = log(ax + sqrt((ax * ax) + 1.0));
    return (x < 0.0) ? -r : r;
}
// Single precision variant of asinh (same odd-symmetry evaluation).
static __inline float asinhf(float x){
    float ax = fabsf(x);
    float r = logf(ax + sqrtf((ax * ax) + 1.0f));
    return (x < 0.0f) ? -r : r;
}
// asinhl simply forwards to the double implementation.
#   define asinhl(x) asinh(x)
// Inverse hyperbolic tangent, evaluated as (ln(1 + x) - ln(1 - x)) / 2.
static __inline double atanh(double x){
    const double log_plus = log(1.0 + x);
    const double log_minus = log(1.0 - x);
    return (log_plus - log_minus) / 2;
}
// Single precision variant of atanh.
static __inline float atanhf(float x){
    const float log_plus = logf(1.0f + x);
    const float log_minus = logf(1.0f - x);
    return (log_plus - log_minus) / 2.0f;
}
// atanhl simply forwards to the double implementation.
#define atanhl(x) atanh(x)
// Cube root via pow(). pow() is only called with a non-negative base:
// non-positive inputs are negated first and the result negated back.
static __inline double cbrt(double x){
    if(x > 0.0){
        return pow(x, 1.0 / 3.0);
    }
    return -pow(-x, 1.0 / 3.0);
}
// Single precision variant of cbrt.
static __inline float cbrtf(float x){
    if(x > 0.0f){
        return powf(x, 1.0f / 3.0f);
    }
    return -powf(-x, 1.0f / 3.0f);
}
// cbrtl simply forwards to the double implementation.
#define cbrtl(x) cbrt(x)
// copysign family mapped onto the CRT's underscore-prefixed equivalents.
// copysignf reuses the double precision _copysign.
#define copysign(x,s) _copysign(x,s)
#define copysignf(x,s) _copysign(x,s)
#define copysignl(x,s) _copysignl(x,s)
// Error function approximated by 1 - poly(t) * exp(-x^2), t = 1 / (1 + p|x|),
// where poly is a 5th order polynomial in t evaluated with Horner's scheme.
// The approximation is computed for |x| and the sign of x applied at the end.
// NOTE(review): the coefficients look like Abramowitz & Stegun 7.1.26 - confirm.
static __inline double erf(double x){
    const double p = 0.3275911;
    const double a1 = 0.254829592, a2 = -0.284496736, a3 = 1.421413741;
    const double a4 = -1.453152027, a5 = 1.061405429;
    const int sign = (x >= 0) ? 1 : -1;
    const double ax = fabs(x);
    const double t = 1.0 / (1.0 + p*ax);
    double poly = a5 * t + a4;
    double y;
    poly = poly * t + a3;
    poly = poly * t + a2;
    poly = poly * t + a1;
    y = 1.0 - poly * t * exp(-ax * ax);
    return sign*y;
}
// Single precision variant: computed in double and narrowed on return.
static __inline float erff(float x){
    return (float)erf(x);
}
// erfl simply forwards to the double implementation.
#define erfl(x) erf(x)
// erfc
// Base-2 exponential, computed as pow(2, x).
static __inline double exp2(double x){
    const double base = 2.0;
    return pow(base, x);
}
// Single precision variant of exp2.
static __inline float exp2f(float x){
    const float base = 2.0f;
    return powf(base, x);
}
// exp2l simply forwards to the double implementation.
#define exp2l(x) exp2(x)
// exp(x) - 1. For |x| below 1e-5 the two-term series x + x^2/2 is used
// instead of exp(x) - 1.0, which would cancel most significant digits.
static __inline double expm1(double x){
    if(!(fabs(x) < 1e-5)){
        return exp(x) - 1.0;
    }
    return x + 0.5 * x * x;
}
// Single precision variant of expm1 (same threshold and series).
static __inline float expm1f(float x){
    if(!(fabsf(x) < 1e-5f)){
        return expf(x) - 1.0f;
    }
    return x + 0.5f * x * x;
}
// expm1l simply forwards to the double implementation.
#define expm1l(x) expm1(x)
// Positive difference: x - y when x > y, otherwise +0.
// If either argument is NaN the result is NaN (C99 requires this; a bare
// "x > y" comparison would silently return 0.0 instead).
static __inline double fdim(double x, double y){
    if(x != x || y != y){
        return x + y;   // propagates the NaN
    }
    return (x > y) ? x - y : 0.0;
}
// Single precision variant of fdim.
static __inline float fdimf(float x, float y){
    if(x != x || y != y){
        return x + y;   // propagates the NaN
    }
    return (x > y) ? x - y : 0.0f;
}
// fdiml simply forwards to the double implementation.
#define fdiml(x,y) fdim(x,y)
// fma family implemented as a plain multiply followed by an add.
// Nothing here forces a fused operation, so unlike a true fma the product
// may be rounded before the addition takes place.
static __inline double fma(double x, double y, double z){
    return ((x * y) + z);
}
// Single precision variant (same multiply-then-add expression).
static __inline float fmaf(float x, float y, float z){
    return ((x * y) + z);
}
// fmal simply forwards to the double implementation.
#define fmal(x,y,z) fma(x,y,z)
// Maximum of two values. A NaN argument is treated as missing data: if exactly
// one argument is NaN the other one is returned (C99 semantics). The plain
// "(x > y) ? x : y" form returned NaN whenever y was NaN.
static __inline double fmax(double x, double y){
    if(x != x){
        return y;
    }
    if(y != y){
        return x;
    }
    return (x > y) ? x : y;
}
// Single precision variant of fmax.
static __inline float fmaxf(float x, float y){
    if(x != x){
        return y;
    }
    if(y != y){
        return x;
    }
    return (x > y) ? x : y;
}
// fmaxl simply forwards to the double implementation.
#define fmaxl(x,y) fmax(x,y)
// Minimum of two values. A NaN argument is treated as missing data: if exactly
// one argument is NaN the other one is returned (C99 semantics). The plain
// "(x < y) ? x : y" form returned NaN whenever y was NaN.
static __inline double fmin(double x, double y){
    if(x != x){
        return y;
    }
    if(y != y){
        return x;
    }
    return (x < y) ? x : y;
}
// Single precision variant of fmin.
static __inline float fminf(float x, float y){
    if(x != x){
        return y;
    }
    if(y != y){
        return x;
    }
    return (x < y) ? x : y;
}
// fminl simply forwards to the double implementation.
#define fminl(x,y) fmin(x,y)
// _HUGE_ENUF is a double constant large enough that squaring it overflows.
#ifndef _HUGE_ENUF
#    define _HUGE_ENUF 1e+300
#endif   
// INFINITY is produced by overflowing a constant multiplication; NAN is
// derived from it by multiplying infinity with zero. Both are of type float.
#define INFINITY   ((float)(_HUGE_ENUF * _HUGE_ENUF))  /* causes warning C4756: overflow in constant arithmetic (by design) */
#define NAN        ((float)(INFINITY * 0.0F))
// Category values produced by the fpclassify macro.
#define FP_INFINITE  1
#define FP_NAN       2
#define FP_NORMAL    (-1)
#define FP_SUBNORMAL (-2)
#define FP_ZERO      0
#define fpclassify(x) ((_fpclass(x)==_FPCLASS_SNAN)?FP_NAN:((_fpclass(x)==_FPCLASS_QNAN)?FP_NAN:((_fpclass(x)==_FPCLASS_QNAN)?FP_NAN: \
 ((_fpclass(x)==_FPCLASS_NINF)?FP_INFINITE:((_fpclass(x)==_FPCLASS_PINF)?FP_INFINITE: \
 ((_fpclass(x)==_FPCLASS_NN)?FP_NORMAL:((_fpclass(x)==_FPCLASS_PN)?FP_NORMAL: \
 ((_fpclass(x)==_FPCLASS_ND)?FP_SUBNORMAL:((_fpclass(x)==_FPCLASS_PD)?FP_SUBNORMAL: \
 FP_ZERO)))))))))
// hypot is available in the CRT under underscore-prefixed names.
#define hypot(x,y) _hypot(x,y)
#define hypotf(x,y) _hypotf(x,y)
 // ilogb
// Classification macros built on the CRT's _finite/_isnan/_fpclass helpers.
#define isfinite(x) _finite(x)
#define isnan(x) (!!_isnan(x))
// The _FPCLASS_* values are distinct bit flags, so a single _fpclass() call
// can be masked against several classes at once. This keeps the macro
// argument from being evaluated more than once.
#define isinf(x) ((_fpclass(x) & (_FPCLASS_NINF | _FPCLASS_PINF)) != 0)
#define isnormal(x) ((_fpclass(x) & (_FPCLASS_NN | _FPCLASS_PN)) != 0)
// Comparison macros implemented with the ordinary relational operators.
#define isgreater(x,y)      ((x) > (y))
#define isgreaterequal(x,y) ((x) >= (y))
#define isless(x,y)         ((x) < (y))
#define islessequal(x,y)    ((x) <= (y))
#define islessgreater(x,y)  (((x) < (y)) || ((x) > (y)))
// Two values are unordered when at least one of them is NaN.
#define isunordered(x,y)    (_isnan(x) || _isnan(y))
// Bessel functions of the first kind, available under underscore-prefixed names.
#define j0(x) _j0(x)
#define j1(x) _j1(x)
#define jn(x,y) _jn(x,y)
// lgamma
// ln(1 + x). For |x| not greater than 1e-4 the two-term series x - x^2/2
// (written as (1 - x/2) * x) is used instead of calling log(1 + x).
static __inline double log1p(double x){
    if(!(fabs(x) > 1e-4)){
        return (-0.5 * x + 1.0) * x;
    }
    return log(1.0 + x);
}
// Single precision variant of log1p (same threshold and series).
static __inline float log1pf(float x){
    if(!(fabsf(x) > 1e-4f)){
        return (-0.5f * x + 1.0f) * x;
    }
    return logf(1.0f + x);
}
// log1pl simply forwards to the double implementation.
#define log1pl(x) log1p(x)
// Base-2 logarithm: log2(x) = ln(x) * log2(e).
// The constant is spelled out instead of using M_LOG2E because MSVC's math.h
// only defines the M_* constants when _USE_MATH_DEFINES is set before the
// header is included, which this code cannot rely on.
static __inline double log2(double x) {
    return log(x) * 1.44269504088896340736;
}
// Single precision variant of log2.
static __inline float log2f(float x) {
    return logf(x) * (float)1.44269504088896340736;
}
// log2l simply forwards to the double implementation.
#define log2l(x) log2(x)
// logb family: every precision is routed to the CRT's double precision _logb.
#define logb(x) _logb(x)
#define logbf(x) _logb(x)
#define logbl(x) _logb(x)
// nearbyint
// Double precision nextafter is available in the CRT as _nextafter.
#define nextafter(x,y) _nextafter(x,y)
// Single precision nextafter. This cannot be routed through the double
// precision _nextafter: the neighbouring double of a float value rounds
// straight back to the original float when narrowed. Instead the float's bit
// pattern is stepped by one unit, which for IEEE-754 binary32 moves to the
// adjacent representable value (assumes a 32-bit unsigned int, as on MSVC).
static __inline float nextafterf(float x, float y){
    union { float f; unsigned int u; } bits;
    if(x != x || y != y){
        return x + y;   // NaN in, NaN out
    }
    if(x == y){
        return y;
    }
    if(x == 0.0f){
        // Smallest subnormal, signed in the direction of y.
        bits.u = 1u;
        return (y > 0.0f) ? bits.f : -bits.f;
    }
    bits.f = x;
    // Moving away from zero increases the magnitude bits, moving towards
    // zero decreases them; the sign bit is left untouched.
    if((x < y) == (x > 0.0f)){
        bits.u++;
    } else{
        bits.u--;
    }
    return bits.f;
}
// nexttoward

// Rounds to an integral value using the FPU's own rounding: adding and then
// subtracting 2^52 discards the fractional bits of any magnitude below 2^52.
// Magnitudes of 2^52 or more are returned unchanged. The sign of the input is
// re-applied to the rounded magnitude.
static __inline double rint(double x){
    const double two_to_52 = 4.5035996273704960e+15;
    const double magnitude = fabs(x);
    if(!(magnitude >= two_to_52)){
        return copysign(two_to_52 + magnitude - two_to_52, x);
    }
    return x;
}
// Single precision rint. The rounding itself is carried out in double
// precision with the same add/subtract 2^52 trick used by rint, and the
// result is narrowed back to float on return.
static __inline float rintf(float x){
    const double two_to_52 = 4.5035996273704960e+15;
    const double magnitude = fabsf(x);
    if(!(magnitude >= two_to_52)){
        return copysignf(two_to_52 + magnitude - two_to_52, x);
    }
    return x;
}
// rintl simply forwards to the double implementation.
#define rintl(x) rint(x)
// Remainder of x / y where the quotient is rounded to the nearest integer
// with rint: x - rint(x / y) * y.
static __inline double remainder(double x, double y){
    return (x - ( rint(x / y) * y ));
}
// Single precision variant of remainder.
static __inline float remainderf(float x, float y){
    return (x - ( rintf(x / y) * y ));
}
// remainderl forwards both arguments to the double implementation.
#define remainderl(x,y) remainder(x,y)
// Same remainder as remainder(), additionally storing the rounded quotient
// of x / y through q.
//   x, y - dividend and divisor
//   q    - output; receives rint(x / y) converted to int (must not be NULL)
// Returns x - rint(x / y) * y.
static __inline double remquo(double x, double y, int* q){
    double d = rint(x / y);
    *q = (int)d;    // store through the pointer, not into the pointer itself
    return (x - (d * y));
}
// Single precision variant of remquo.
static __inline float remquof(float x, float y, int* q){
    float f = rintf(x / y);
    *q = (int)f;    // store through the pointer, not into the pointer itself
    return (x - (f * y));
}
// remquol forwards all three arguments to the double implementation.
#define remquol(x,y,q) remquo(x,y,q)
// Rounds to the nearest integer, halfway cases away from zero.
// The value is first truncated towards zero with floor/ceil and then adjusted
// by one if the discarded fraction is at least one half. Computing
// floor(x + 0.5) instead is wrong whenever x + 0.5 itself has to be rounded:
// 0.49999999999999994 would become 1.0 and odd integers of magnitude 2^52 or
// more would move to the next even value.
static __inline double round(double x){
    double t;
    if(x > 0.0){
        t = floor(x);
        return (x - t >= 0.5) ? t + 1.0 : t;
    }
    t = ceil(x);
    return (t - x >= 0.5) ? t - 1.0 : t;
}
// Single precision variant of round (same truncate-then-adjust scheme).
static __inline float roundf(float x){
    float t;
    if(x > 0.0f){
        t = floorf(x);
        return (x - t >= 0.5f) ? t + 1.0f : t;
    }
    t = ceilf(x);
    return (t - x >= 0.5f) ? t - 1.0f : t;
}
// roundl simply forwards to the double implementation.
#define roundl(x) round(x)
// scalbn
// Non-zero when x carries a negative sign: the sign of x is copied onto 1.0
// and the result compared against zero.
#define signbit(x) (_copysign(1.0, x) < 0)
// tgamma
// Rounds towards zero: floor for positive values, ceil for everything else.
static __inline double trunc(double x){
    if(x > 0.0){
        return floor(x);
    }
    return ceil(x);
}
// Single precision variant of trunc.
static __inline float truncf(float x){
    if(x > 0.0f){
        return floorf(x);
    }
    return ceilf(x);
}
// truncl simply forwards to the double implementation.
#define truncl(x) trunc(x)
// Bessel functions of the second kind, available under underscore-prefixed names.
#define y0(x) _y0(x)
#define y1(x) _y1(x)
#define yn(x,y) _yn(x,y)
// rint with the result converted to long.
static __inline long lrint(double x){
    return (long)rint(x);
}
// Single precision variant of lrint.
static __inline long lrintf(float x){
    return (long)rintf(x);
}
// lrintl simply forwards to the double implementation.
#define lrintl(x) lrint(x)
// round with the result converted to long.
static __inline long lround(double x){
    const double rounded = round(x);
    return (long)rounded;
}
// Single precision variant of lround.
static __inline long lroundf(float x){
    const float rounded = roundf(x);
    return (long)rounded;
}
// lroundl simply forwards to the double implementation.
#define lroundl(x) lround(x)
// rint with the result converted to long long.
static __inline long long llrint(double x){
    const double rounded = rint(x);
    return (long long)rounded;
}
// Single precision variant of llrint.
static __inline long long llrintf(float x){
    const float rounded = rintf(x);
    return (long long)rounded;
}
// llrintl simply forwards to the double implementation.
#define llrintl(x) llrint(x)
// round with the result converted to long long.
static __inline long long llround(double x){
    const double rounded = round(x);
    return (long long)rounded;
}
// Single precision variant of llround.
static __inline long long llroundf(float x){
    const float rounded = roundf(x);
    return (long long)rounded;
}
// llroundl simply forwards to the double implementation.
#define llroundl(x) llround(x)
#endif

Saturday, 3 May 2014

Building FFmpeg on Windows with in-line asm and the Intel compiler (Part 3).

Previously I have posted about efforts to build FFmpeg natively under windows with inline assembly enabled and using the Intel compiler:

Building FFmpeg on Windows with in-line asm and the Intel compiler.
Building FFmpeg on Windows with in-line asm and the Intel compiler (Part 2).

After many months of patching, testing and review the current upstream FFmpeg master is now fully updated to support inline asm compilation with Intel compiler. This means that my patches do not need to be explicitly applied as now the default FFmpeg repository has been updated to include all my changes. So now those people who have access to the Intel compiler on Windows can build FFmpeg using the latest source and will automatically have all the hand-tuned assembly optimizations built in without any extra effort.

For those interested in testing it out they can grab the latest FFmpeg source from their git master. Normally FFmpeg must be built through MSYS/Cygwin on Windows so if you're not that way inclined you can also check out my repository which includes a copy of FFmpeg master with some additional Visual Studio project files added in. With these you can now compile FFmpeg natively directly through Visual Studio.

Building FFmpeg in Visual Studio.

These changes are all part of work designed to improve the state of FFmpeg on Windows. Also added to upstream FFmpeg are patches to enable OpenCL support with native win32 threads as well as several patches for dependency library linking errors (libvpx, libssh to name a few). It took a little while to get all these patches approved (asm patches to upstream libmpcodecs took a particularly long time) but as of this morning the libmpcodecs changes were pulled into mainstream FFmpeg which was the last change required for full icl support. A big shout out to Michael Niedermayer for reviewing and providing useful feedback on all the patches. He was a big help in pointing me in the right direction and for spotting all my mistakes (a side effect of me writing and submitting many of the patches in the wee hours of the morning).

As always you can grab the code from my repository and feel free to post any bugs/errors you may encounter.
https://github.com/ShiftMediaProject/FFmpeg

Monday, 14 April 2014

Intel and Microsoft compiler linker errors (already defined in)

Recently I ran into a rather strange error that took me a little while to track down. So to make it easier for anyone else who runs into this, I'll describe the error and its resolution here.

Basically the error was during linking a program and showed up as:

LIBCMT.lib(log10.obj) : error LNK2005: _log10 already defined in libmmt.lib(log10_stub.obj)

This was just one of many errors that decided to pop up (basically the same errors but for things like sin/cos etc.). This was a rather odd error and it pretty much doesn't matter what linker options you change, there is no way to save you from this one (don't bother trying to ignore specific libs as that will just make things worse - trust me). The problem was due to a program that includes static libs where 1 of those libs was created by the standard Microsoft compiler and the other was created by the Intel compiler. The problem is that each of these input libs declared the same function but in different ways (one the Intel way, the other the Microsoft way).

The issue revolves around the standard C library (obviously I was writing C/C++ code). When using the Microsoft compiler the standard C library is libcXX where XX can change based on different settings. In the above example I was using the standard multi-threaded version (hence the mt appended to the end of libc). The Intel compiler also defines its own implementation of various standard C functions which it puts in its own libmXX. So the above problem arises because these two libs are clashing with each other. This would not be a problem if both input libs were compiled with the same compiler but if that is not an option then we have to find a solution.

Luckily Intel is not entirely clueless here and their libmXX libraries are designed to play nice with the default Microsoft ones. So why the above error? Well the problem arises when each input library uses a different compilation option. There will always be a problem with mixing code generated against different libcXX versions even when using the same compiler but as you can see from above both are generated using the 'mt' version so all should be fine. The actual solution is due to the implementation of each of these clashing functions. The example above was the log10 function, which is only clashing because it is being used in different ways that cannot be resolved between the Intel and Microsoft compilers.

The culprit is a setting called "FloatingPointModel". If building through Visual Studio then this setting can be found under "C/C++->Code Generation" in the project properties. The issue occurs when 1 of the input libs was compiled using a different floating point model than the other lib. For instance if one of the libs was set to use Intel compilers "Fast2" setting then it will use the appropriate optimizations and optimized functions from within libmXX. However if the other lib was compiled using a different floating point model then it will try and use that version of the function. The problem is that they both have the same function name but have different implementations in different compiler libc implementations. All of a sudden libmXX and libcXX wont play nice together as they are both trying to use different optimized versions of a math function that has the same name. The linker doesn't know which one to use and so errors abound.

The solution is simply making sure that all input libs are compiled using the same setting for the floating point model. This rules out using Intels "Fast2" option as this is never available on Microsoft's so it will always cause problems. But apart from that making sure they are all the same value (or not setting any value) should make this rather rare and frustrating bug go away.

Friday, 14 March 2014

x264 Performance with Different Compilers

Recently I made a post about compiling x264 natively using Visual Studio under Windows. With that post I also made available my git repository that includes all the necessary Visual Studio projects to get x264 up and compiling under Windows. Since the default build chain for x264 uses a gcc based compiler (which is MinGW on Windows) I wondered if there was any performance difference between the generated binaries.

So I set out by building the x264 command line tool using both MSVC from Visual Studio 2013, Intel Compiler XE 2013 SP1 and MinGW 4.8.2. To test I passed the first minute of Big Buck Bunny from the 1080p mp4 file found on the main site (www.bigbuckbunny.org/index.php/download/). I then did a CRF=20.0 encode using the 'Very Slow' pre-set. This may not exactly be the most exhaustive test but I wasn't feeling like waiting around for 10+ hour encodes.

The results can be seen for themselves:

MinGW: 3min 5sec
MSVC: 3min 5sec
ICL: 3min 6sec
ICL (O3): 3min 5sec

ICL shows up twice because I did the first one using the default compilation options that MSVC was using. I then did a second build using the higher compilation options that Intel compiler can support (a.k.a I set the /O3 compile option as opposed to the default /O2).

As you can see the difference is negligible (there so little to see here I didn't even bother making a nice table or graph). In fact the only variations was with ICL and that was probably within the error of the test measurement anyway so for all intents and purposes all the final builds performed identically. This is not too surprising as x264 uses a lot of hand tuned assembly in the key bits which there isn't much more that any of the compilers could do with. The result is that the code is optimised as much as possible so there is no room for the compiler to improve it. Therefore the performance is pretty much consistent across all compilers.

Of course Ill be the first to admit that this was a pretty quick and dirty test. Its quite possible that after a 10+ hour encode some larger differences may start to appear. However based on these values it is unlikely that those differences would be very substantial (a couple of minutes here and there over a 10+ hour run).

So if you were expecting (or hoping) for big differences then I'm sorry to disappoint. What can be taken away from this is that it appears that it doesn't matter what compiler you use to build x264 you will still achieve the same performance. So for those who prefer GCC they can continue to MSYS it up, while those who prefer Visual Studio can do so happily as there is no performance downside in doing so.

Thursday, 13 March 2014

Building x264 on Windows with Visual Studio

For anyone who has ever done any video encoding (or just watching for that matter) will probably have heard of x264. And if you havnt then you should have at least have heard of h264 (and if you still havnt then this post is probably not for you) which is the standard name for the video codec that the x264 encoder implements. So in the world of h264 video encoding x264 is probably one of, if not the best. And it is completely free and open source. So anyone who wants to get their hands on it can do so easily.

One downside of the x264 project (much like most open source projects) is that the default build tool is a gnu make style build chain. These don't run natively on Windows and generally requires an emulated shell such as MSYS or Cygwin. And even from within these shells they only support gcc (MinGW on Windows) based compilers.

But for those wanting to compile x264 natively on Windows using the native Windows build chain (i.e. Visual Studio) then that can actually be done rather simply. For those with Visual Studio 2013 then compiling many similar open source projects becomes a lot easier due to the addition of partial C99 support. C99 is something Microsoft have been neglecting for many years but with the 2013 updates it is a lot closer. x264 however still requires some manipulation in order to get it compile under MSVC's mostly C89 world.
Luckily changing x264 for MSVC compatibility is rather straightforward. Most of the issues are a result of C89 requiring all variable declarations to be together at the start of each logical block of code. Straight off the bat most of the errors that Visual Studio will spit out about the x264 code will be a result of this issue (although the error codes don't do you any favours in realising this). So most errors will be due to code such as the following:

int padv = PADV >> v_shift;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
if( b_end && !b_start )
    height += 4 >> (v_shift + SLICE_MBAFF);
pixel *pix;
int starty = 16*mb_y - 4*!b_start;

This will generate an error on the variable 'pix' because it is declared midway through a block of code. Luckily the specifications say 'block' of code, which does not mean function or something similar. Instead it essentially means anything between a set of '{' and '}'s (There is actually a bit more to it than that but for our purposes - as you'll see later - it is good enough). In the above example we can see that the declaration of 'pix' occurs after an if statement. So if all we need to separate blocks are some '{}'s then modifying the code to the following will actually work:

int padv = PADV >> v_shift;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
if( b_end && !b_start )
{
    height += 4 >> (v_shift + SLICE_MBAFF);
}
pixel *pix;
int starty = 16*mb_y - 4*!b_start;

All we did here was add the '{' and the '}' to the if statement. This makes that statement a block and so the line following it becomes a new block which makes everything work. This is surprisingly simple fix and will work for all of the cases found in libx264. In fact a complete working libx264 can be achieved by just performing the above operation at 10 different location in code. Or if you couldn't be bothered doing it yourself you can just apply the following patch that I have already made up for you.

Download patch file:
https://github.com/ShiftMediaProject/x264/commit/d9004ba604283fb70a3b67d444f67576c00a0e2e.diff

Now for those who don't just want the lib for x264 but actually want to compile the command line x264.exe then you'll have to perform the same operation a few more times. However there are 2 additional things youll need to do.
First is related to an issue with the use of unions. Specifically the following piece of code:

return (union {double f; uint64_t i;}){value}.i;

The above is too much for MSVC to handle. But with a bit of massaging it can be made to work. Massaging such as this:

union { double f; uint64_t i; } ulld = { value };
return ulld.i;

The second issue is due to initialization lists being used on an array of structs. MSVC defaults to thinking that each element in the initializer list is actually an input for each component for the actual struct. So in the following piece of code MSVC treats the initializer list as actually an initializer list for the first 'AVS_Value' in the array.

AVS_Value arg_arr[] = { res, avs_new_value_bool( info->interlaced ), avs_new_value_string( matrix ) };

This will cause some nonsensical errors such as how a type of AVS_Value can not be converted to type short (short here being the type of the first member of AVS_Value). There is pretty much no combination of additional '('s and '{' that will fix this problem. So we have to fallback to the slightly less convenient way of just specifying each array element individually.

AVS_Value arg_arr[3];
arg_arr[0] = res;
arg_arr[1] = avs_new_value_bool( info->interlaced );
arg_arr[2] = avs_new_value_string( matrix );

This is not as nice to look at but it works. Putting all these pieces together and x264cli will compile under Visual Studio without any further problems. Again for those who dont want to do all this themselves then the appropriate patch can be acquired from below.

Download patch file:
https://github.com/ShiftMediaProject/x264/commit/4c51a4fc51737d932eddb4060bcd03d861dfec7d.diff

Of course if you don't want to worry about any of the above then you can check out my git repository that has all the necessary changes already applied and even comes with a pre-built Visual Studio project file. My repo is up to date with the current upstream master and with x264 development slowing down due to the upcoming x265 then its unlikely my repo will fall behind the upstream master so can be treated as up to date.

git repository:
https://github.com/ShiftMediaProject/x264

So with all of those above fixes it should be reasonably trivial to get x264 building natively in Windows. As to whether this is worth it is a debate for another time. For those who aren't familiar with MSYS/Cygwin but are familiar with Visual Studio then this should be a big win. What will be interesting is to test the performance of the compiled binaries between the different compilers. Perhaps I'll have more on that soon.....

Saturday, 25 January 2014

Building FFmpeg in Visual Studio

The default build chain for the FFmpeg project uses the standard (well standard for gnu open source projects) gnu autotools. The use of configure and make may be rather familiar to those who compile often on linux but for many Windows developers these can seem like somewhat alien tools. This is compounded by the fact that to use these tools on Windows requires setting up a MSYS/Cygwin environment which can often be easier said than done. Even after that most build chains using this environment require a gcc based compiler which on Windows is MinGW. Gcc is a good compiler but MinGW can have some issues (which in its defence is generally always around Windows specific things) which can make it less than ideal.

FFmpegs default build tools do currently support compiling natively in msvc (Microsoft's C compiler) and will even convert the C99 FFmpeg code to msvc compliant C89 code. But this still requires setting up a MSYS shell and any additional FFmpeg dependencies don't offer msvc support from within the same build tools.

So as powerful as the configure/make build chains can be and say what you like about the msvc compiler (I agree its standards compliance is abysmal) but for those wanting to develop natively on Windows having Visual Studio support is the simplest and most robust. Since FFmpeg wont maintain a native Visual Studio build chain (and I cant blame them for not wanting to), I got a little bored and decided to take it upon myself to provide an alternative.

The result of some holiday free time is that I wrote up a FFmpeg Visual Studio project generator. This is a simple piece of software that will scan in the existing FFmpeg configure/make files and then use those to dynamically generate a Visual Studio project file that can be used to natively compile FFmpeg. This program not only builds the project files but will also generate the config files dynamically based on a combination of rules found in the original configure script and any passed in arguments. In fact the generator accepts many of the same configuration arguments as the configure script allowing fine-tuned control of what gets enabled/disabled.
Example command line options:

ffmpeg_generator.exe --enable-gpl --enable-bzlib --enable-iconv 
   --enable-zlib --disable-encoders --enable-encoder=vorbis

The program doesn't support all the options of the configure script but it supports many of them and in some cases actually exposes more than the original. Any option that shows up in the config.h file can be manually enabled/disabled through the command line. And even if an invalid option is set the generator uses all the inbuilt configuration checks found in the original build script to automatically validate each setting. Options such as 32/64bit are some of the options that are not supported, mainly because they are not relevant as the generated config.h file can be used for both 32/64 bit and even detects and enables inline asm if the Intel compiler is made available.

// Excerpt of the generated config.h: ARCH_X86 is always enabled, while the
// 32-bit flag is derived at compile time from the compiler's 64-bit target
// macros so the same file can be used for both 32 and 64 bit builds.
#define ARCH_X86 1
#if defined( __x86_64 ) || defined( _M_X64 )
#   define ARCH_X86_32 0
#else
#   define ARCH_X86_32 1
#endif

All of this happens dynamically so for those building from git master, once any commits are added to your repo you just have to rerun the generator program and it will automatically detect any changes (new/deleted files, configuration changes, new/deleted options etc.) and generate a new project accordingly.

Now I should point out that this generator was rather quickly (and half-assed) thrown together so its not exactly very brilliant code. But for something quick and dirty it gets the job done. It supports all current additional dependencies but not all have been checked and it takes certain liberties with respect to library naming. This is because many dependency libraries don't have consistent naming conventions so the link include the generated project uses may not be exactly the same as the actually file you are linking against. Should this happen then you'll just have to manually tweak the file-name in the include option as without any kind of naming consistency there's not much that can be done about it.

The generator also by default generates a project with default Intel compiler settings. Those without Intel compiler may have to change a few settings in the project properties if they want to set it back to the standard msvc. Intel is chosen as default as Visual Studio 2012 doesn't support enough C99 features to be able to compile FFmpeg so only the Intel compiler can be used with 2012 to build the project. For those with Visual Studio 2013 the default compiler adds enough C99 to be able to get it to work but for the moment the generator is built to default to 2012. The same project can be loaded in both 2012/2013 and all that needs to be changed is the compiler being used. If there is enough interest I may add an option to the generator to allow for people to specify whether they want Intel support or not at generation time but in the meantime you'll just have to change the build tool in the project properties.

Update: The current version of the generator allows for specifying the compiler that will be used as default in the output project file. Of course this can also be changed directly in the project after it is generated but for convenience the "toolchain" option is now processed by the generator. With newer patches to FFmpeg Visual Studio 2013 can compile it without problems. The toolchain parameter accepts either "msvc" for default Microsoft compiler or "icl" for the Intel compiler which also supports inline assembly.

ffmpeg_generator.exe --toolchain=msvc

The project generator can be found in my git repo below. Also in the repo is a pre-built project that is built using the following command options:

ffmpeg_generator.exe --enable-gpl --enable-version3 --enable-avisynth
   --enable-nonfree --enable-bzlib --enable-iconv --enable-zlib
   --enable-libmp3lame --enable-libvorbis --enable-libspeex
   --enable-libopus --enable-libfdk-aac --enable-libtheora
   --enable-libx264 --enable-libxvid --enable-libvpx --enable-libmodplug
   --enable-libsoxr --enable-libfreetype --enable-fontconfig 
   --enable-libass --enable-openssl --enable-librtmp --enable-libssh

Update: The default projects now include libcdio, libbluray, opengl, opencl and sdl enabled. More will be enabled as they are tested. All of these dependencies have working Visual Studio projects found in the SMP directories in each of their repos found at the parent ShiftMediaProject repository.

You're free to use this project directly as I keep it up to date and all the necessary dependency projects can also be found in my github.

git repository:
https://github.com/ShiftMediaProject/FFmpeg

Sunday, 5 January 2014

Building FFmpeg on Windows with in-line asm and the Intel compiler (Part 2).

In my last post I talked about how to get FFmpeg to compile under Windows with the Intel compiler. This has advantages in that the Intel compiler supports compilation of AT&T style inline assembly. This means that its possible to use the hand tuned optimised code found in FFmpeg while natively compiling for Windows (something that is otherwise not possible). Unfortunately the assembly support in the Windows version of the compiler is not complete and so the inline asm wont compile without some changes.

The previous post on this subject had a patch that allowed for compilation with inline asm. However since then I have cleaned up the patch and fixed a few things that I wasnt entirely happy about. In fact after talking to Michael Niedermayer and others on the FFmpeg mailing list I ended up writing an entirely new patch. This patch is currently still pending review but for those who are interested in getting this working now they can grab the patch from the end of this post.

The main changes in this version is the way the inline asm was changed from using direct symbol references to something defined in an asm-interface. From my previous post I mentioned that Intel compiler does not support direct symbol references in code so previously I had changed all of these to asm-interfaces. However the FFmpeg developers didn't want to change any existing code and there was some concerns over how moving variables from direct symbols into the interface may affect Position Independent Code compilation. So based on a suggestion from Michael the patch was changed to use named constraints. For those familiar with named constraints you'll know that to use these all you have to do is replace a direct symbol reference with a named constraint.

Example of existing direct symbol reference:

"movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
"lea         (%3, %3, 2), %1        \n\t"

Since FFmpeg already had a macro for name mangling the direct symbol references it was rather simple to just change this macro to generate a named constraint instead. In order to do this the definition of MANGLE was changed so that with Intel compiler on Windows it generates a named constraint.

// The Intel compiler on Windows (where both __INTEL_COMPILER and _MSC_VER are
// defined) is treated as lacking direct symbol references in inline asm;
// every other build chain is treated as supporting them.
#if !defined(__INTEL_COMPILER) || !defined(_MSC_VER)
#   define HAVE_DIRECT_SYMBOL_REF 1
#else
#   define HAVE_DIRECT_SYMBOL_REF 0
#endif

#if !HAVE_DIRECT_SYMBOL_REF
    // No direct references: turn the symbol name into a named constraint,
    // e.g. MANGLE(foo) becomes the string "%[foo]"
#   define MANGLE(a) "%[" #a "]"
#else
    // Direct references available: keep the regular mangled symbol name
#   define MANGLE(a) EXTERN_PREFIX LOCAL_MANGLE(a)
#endif

Of course using a named constraint by itself won't work as the constraint still needs to be added to the asm-interface. Since this additional interface is only required for Intel on Windows it is not desirable to have it all the time. So in keeping with not changing any existing code the asm-interfaces were added using a new macro that simply does nothing for all other build chains. Using this the above inline asm becomes:

// Example of an existing asm block after the patch. The instruction text is
// unchanged; the only addition is NAMED_CONSTRAINTS_ADD in the input operands.
__asm__ volatile (
   // MANGLE(ff_pb_80) is either the mangled symbol name or, when direct symbol
   // references are unavailable, the named constraint "%[ff_pb_80]".
   "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
   "lea         (%3, %3, 2), %1        \n\t"
   put_signed_pixels_clamped_mmx_half(0)
   "lea         (%0, %3, 4), %0        \n\t"
   put_signed_pixels_clamped_mmx_half(64)
   // Output operands
   : "+&r"(pixels), "=&r"(line_skip3)
   // Input operands. NAMED_CONSTRAINTS_ADD expands to nothing when direct
   // symbol references are available, otherwise it appends
   // , [ff_pb_80] "m"(ff_pb_80) so the named constraint above resolves.
   : "r"(block), "r"(line_skip)
     NAMED_CONSTRAINTS_ADD(ff_pb_80)
   // Clobbers
   : "memory"
);

The resulting changes are rather minimal and when compiling using any previously supported build chains there is no apparent difference in the code before and after this change. The macro NAMED_CONSTRAINTS is used to add each of the names of any directly accessed symbols. This macro just needs the name and can take a comma separated list of up to 10 values.

However adding the macro NAMED_CONSTRAINTS required slightly more work than I would have liked but it at least worked as required. The difficulty was due to a bug in both the Intel and Microsoft compilers where variadic arguments were not properly expanded (see my post on the subject http://siliconandlithium.blogspot.com/2014/01/macro-variadic-argument-expansion-on.html). Using the working variadic for-each from my previous post the implementation of NAMED_CONSTRAINTS is:

// Operand-list helpers for symbols that the asm text references via MANGLE.
#if !HAVE_DIRECT_SYMBOL_REF
    // Builds one named memory operand for a symbol: [sym] "m"(sym)
#   define NAME_CONSTRAINT(sym) [sym] "m"(sym)
    // Takes the list of referenced symbols; for operand lists that have no
    // previously defined constraints (no leading comma is emitted)
#   define NAMED_CONSTRAINTS(...) FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
    // Same, but emits a leading comma so it can follow existing constraints
#   define NAMED_CONSTRAINTS_ADD(...) , FOR_EACH_VA(NAME_CONSTRAINT,__VA_ARGS__)
#else
    // Direct symbol references work, so no extra operands are needed
#   define NAMED_CONSTRAINTS(...)
#   define NAMED_CONSTRAINTS_ADD(...)
#endif

Putting this all together and for the most part all existing inline asm will compile without any problems. However you'll notice I said 'most'. The Intel compiler is extremely fussy about the use of inline AT&T assembly. This is most likely because this is considered a rarely used feature and so does not see much in the way of support. Unfortunately this means that using inline asm is much harder than it needs to be. So even after working around the missing support for direct symbol references the compiler still has many issues. The main one is that the compiler will sporadically decide it doesn't like some particular assembly line and generate an error. The same assembly line will be working fine until you change a compilation option (or in some cases just move the assembly block somewhere else) then all of a sudden it will generate an error. Ideally it would be nice if Intel cleaned up some of these inconsistencies but in the meantime the patch had to work around them.

So the patch does change a couple of things. In fact there are 2 instances (in x86/motion_est.c and vf_fspp.c) where a direct symbol reference had to be removed and replaced with an asm-interface. This may be seen as changing the original code but in both instances the variable in question already existed in an asm-interface so their additional inclusion should have absolutely zero effect. In fact just to be sure I checked the generated code from gcc before and after the patch and ensured that every line was identical.

The patch passes all FATE tests and compiles on Intel (under normal release compilation options) and has zero impact on any other build chain. For those interested the full patch can be downloaded below.

Update 2: New and improved patches have been created and these are now available directly in FFmpeg master (no patching required). See my post for more information (Building FFmpeg on Windows with in-line asm and the Intel compiler (Part 3)).
Update: A new patch is available that adds support for an extra file. This file is only used when FAST_CMOV is enabled so was missed previously. The new patch should be grabbed from here:
https://github.com/ShiftMediaProject/FFmpeg/commit/26acab2672f27b923510ec35cbbade69a7776c9d.diff

Download patch file:
https://github.com/ShiftMediaProject/FFmpeg/commit/6eea177a4761629df5d6f02359fccc4efb116aed.diff

Note: Unlike my last patch this one is done directly against the current master (at time of writing).

Macro variadic argument expansion on Windows.

Recently while trying to write some code I decided to use a variadic macro. For those familiar with the concept you'll know that they can be very powerful. Unfortunately I ran into a slight problem when using them. That problem being that the Microsoft compiler (and the Intel compiler for Windows) were both incorrectly expanding the variadic arguments (__VA_ARGS__). Or more to the point they both were completely failing to expand the arguments. This meant that it was not directly possible to get each element individually as the compiler was essentially treating the whole variadic argument as one long string (commas included). This is directly in violation of what the standards require and so both compilers are being non-compliant. The Microsoft compiler has never embraced C99 so this should not be too much of a surprise but Intel's failure here is a bit more egregious.

Luckily there is a way to get around this. It is not quite as elegant as what works on other compilers but it will at least work on Windows while still building on other build chains. So the example I will give is a for-each macro. This uses the variadic expansion and then sends each argument element to a specific macro. The trick here was that each expansion had to be explicitly handled. This means that if you want to support a variadic macro that can take up to 10 arguments then you need to create 10 explicit variants of the expansion. This limits the maximum number of arguments that can be supported (by however many expansions you can be bothered to write) and is also not very elegant but at least it will work.

// FOR_EACH_VA(P, a, b, ...) applies P to every argument and yields the results
// as a comma separated list: P(a), P(b), ...
// Between 1 and 10 arguments are supported.
//
// FE_n handles a list of n+1 elements: it applies P to the first element and
// passes the remaining n elements on to FE_(n-1). FE_0 ends the chain.
#define FE_0(P,X) P(X)
#define FE_1(P,X,X1) P(X), FE_0(P,X1)
#define FE_2(P,X,X1,X2) P(X), FE_1(P,X1,X2)
#define FE_3(P,X,X1,X2,X3) P(X), FE_2(P,X1,X2,X3)
#define FE_4(P,X,X1,X2,X3,X4) P(X), FE_3(P,X1,X2,X3,X4)
#define FE_5(P,X,X1,X2,X3,X4,X5) P(X), FE_4(P,X1,X2,X3,X4,X5)
#define FE_6(P,X,X1,X2,X3,X4,X5,X6) P(X), FE_5(P,X1,X2,X3,X4,X5,X6)
#define FE_7(P,X,X1,X2,X3,X4,X5,X6,X7) P(X), FE_6(P,X1,X2,X3,X4,X5,X6,X7)
#define FE_8(P,X,X1,X2,X3,X4,X5,X6,X7,X8) P(X), FE_7(P,X1,X2,X3,X4,X5,X6,X7,X8)
#define FE_9(P,X,X1,X2,X3,X4,X5,X6,X7,X8,X9) P(X), FE_8(P,X1,X2,X3,X4,X5,X6,X7,X8,X9)
// Yields its 11th argument. FOR_EACH_VA places the user arguments in front of
// the list FE_9..FE_0, so each user argument shifts the list one slot to the
// right and the FE_n matching the argument count lands in the NAME slot.
#define GET_FE_IMPL(_0,_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
// Extra level of indirection: A is a complete parenthesised argument list that
// only becomes an invocation of GET_FE_IMPL when the expansion is rescanned.
#define GET_FE(A) GET_FE_IMPL A
// Places the selected FE_n next to its parenthesised argument list so that the
// pair is rescanned as a macro invocation.
#define GET_FE_GLUE(x, y) x y
// The trailing 0 is a sentinel that is never emitted. It guarantees that the
// "..." of GET_FE_IMPL receives at least one argument even when FOR_EACH_VA is
// given a single element, which ISO C requires before C23.
#define FOR_EACH_VA(P,...) GET_FE_GLUE(GET_FE((__VA_ARGS__,FE_9,FE_8,FE_7,FE_6,FE_5,FE_4,FE_3,FE_2,FE_1,FE_0,0)), (P,__VA_ARGS__))

This requires 3 pieces. The first is GET_FE_IMPL which is used to select the explicit expansion macro (FE_0, FE_1 etc.) that matches the number of variadic arguments. The second piece is GET_FE. This macro just passes directly to GET_FE_IMPL and on other compilers is completely redundant but on Windows it is used to add an extra level of indirection that forces the input parameters to be correctly expanded. Remove it and everything will fail horribly. The final piece required to get this to work is the GET_FE_GLUE macro. This performs the same task as GET_FE in that it is required to correctly pass the variadic arguments. Both of these are required in order to get around the bugs in the compilers.

This new macro can be used to create any new macro that should perform a desired operation on each of the input arguments. All that is required is to call FOR_EACH_VA and pass it a macro followed by the variadic arguments. The passed macro will then be called and passed each individual element in the argument list.

Example:

// Per-element macro: FOR_EACH_VA invokes it once for each argument. The body
// below is a placeholder to be replaced with the desired per-element code.
#define DO_SOMETHING(x) /*Insert whatever code you wish to operate on the element 'x'*/
// The definition of the new variadic macro. The per-element results are joined
// with commas by the FE_n expansion macros.
#define DO_SOMETHING_VA(...) FOR_EACH_VA(DO_SOMETHING,__VA_ARGS__)

With the above you should be able to create any variadic macro you want.

Example usage:

//Perform DO_SOMETHING on each of the 3 arguments
DO_SOMETHING_VA(element1,element2,element3)

If you want to support an additional number of input arguments (the above only supports a maximum of 10) then all you have to do is add more explicit expansion macros (e.g. FE_10, FE_11 etc.) and then add these in sequence to GET_FE_IMPL and to FOR_EACH_VA.

Silicon & Lithium