|
|
/*****************************************************************************/
|
|
|
// Copyright 2006-2019 Adobe Systems Incorporated
|
|
|
// All Rights Reserved.
|
|
|
//
|
|
|
// NOTICE: Adobe permits you to use, modify, and distribute this file in
|
|
|
// accordance with the terms of the Adobe license agreement accompanying it.
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
#include "dng_utils.h"
|
|
|
|
|
|
#include "dng_area_task.h"
|
|
|
#include "dng_assertions.h"
|
|
|
#include "dng_bottlenecks.h"
|
|
|
#include "dng_flags.h"
|
|
|
#include "dng_globals.h"
|
|
|
#include "dng_host.h"
|
|
|
#include "dng_image.h"
|
|
|
#include "dng_mutex.h"
|
|
|
#include "dng_point.h"
|
|
|
#include "dng_rect.h"
|
|
|
#include "dng_simd_type.h"
|
|
|
#include "dng_tile_iterator.h"
|
|
|
|
|
|
#if qMacOS
|
|
|
#include <CoreServices/CoreServices.h>
|
|
|
#endif
|
|
|
|
|
|
#if qiPhone || qMacOS
|
|
|
// these provide timers
|
|
|
#include <mach/mach.h>
|
|
|
#include <mach/mach_time.h>
|
|
|
#endif
|
|
|
|
|
|
#if qiPhone || qLinux
|
|
|
#include <signal.h> // for raise
|
|
|
#endif
|
|
|
|
|
|
#if qWinOS
|
|
|
#include <windows.h>
|
|
|
#else
|
|
|
#include <sys/time.h>
|
|
|
#include <stdarg.h> // for va_start/va_end
|
|
|
#endif
|
|
|
|
|
|
#include <atomic>
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
#if qDNGDebug
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// DNG_DEBUG_BREAK is the statement dng_show_message executes to stop in the
// debugger. It expands to nothing on platforms that match none of the cases
// below.

#if qMacOS

	#define DNG_DEBUG_BREAK __asm__ volatile ("int3")

#elif qiPhone

	#if qiPhoneSimulator

		// The simulator runs on Intel hardware, so the x86 breakpoint
		// instruction can be used.

		#define DNG_DEBUG_BREAK __asm__ volatile ("int3")

	#else

		// The debugger stops one level deeper, inside __kill. The same
		// approach works on Linux and Android too.

		#define DNG_DEBUG_BREAK raise(SIGTRAP)

	#endif

#elif qWinOS

	// DebugBreak has to be emulated on WinRT.

	#define DNG_DEBUG_BREAK DebugBreak()

#elif qAndroid || qLinux

	#define DNG_DEBUG_BREAK raise(SIGTRAP)

#else

	#define DNG_DEBUG_BREAK

#endif
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Reports a debug message. Depending on the build flags and on the
// gPrintAsserts / gBreakOnAsserts globals, the text is written to stderr,
// handed to a platform debugger/dialog facility, and/or followed by a
// debugger break.

void dng_show_message (const char *s)
	{

	// Supply a trailing newline only when the message does not already end
	// with one.

	const size_t length = strlen (s);

	const char *nl = (length != 0 && s [length - 1] == '\n') ? "" : "\n";

	#if qDNGPrintMessages

	// Print-only build: just write the message.

	if (gPrintAsserts)
		{
		fprintf (stderr, "%s%s", s, nl);
		}

	#elif qiPhone || qAndroid || qLinux

	if (gPrintAsserts)
		{
		fprintf (stderr, "%s%s", s, nl);
		}

	// iOS does not print a message to the console the way DebugStr and
	// MessageBox do, so printing and breaking are handled separately here.
	// After the break, the program counter has to be advanced manually
	// past this statement.

	if (gBreakOnAsserts)
		{
		DNG_DEBUG_BREAK;
		}

	#elif qMacOS

	if (gBreakOnAsserts)
		{

		// Build a length-prefixed string: byte 0 holds the length, followed
		// by at most 255 characters of the message (longer text is
		// truncated).

		char prefixed [256];

		uint32 count = (uint32) length;

		if (count > 255)
			{
			count = 255;
			}

		prefixed [0] = (unsigned char) count;

		strncpy (&(prefixed [1]), s, count);

		DebugStr ((unsigned char *) prefixed);

		}

	else if (gPrintAsserts)
		{
		fprintf (stderr, "%s%s", s, nl);
		}

	#elif qWinOS

	// Display a dialog.
	// This is not thread safe: multiple message boxes can be launched.
	// It should also be launched in its own thread so that the main message
	// queue is not thrown off.

	if (gBreakOnAsserts)
		{
		MessageBoxA (NULL, (LPSTR) s, NULL, MB_OK);
		}

	else if (gPrintAsserts)
		{
		fprintf (stderr, "%s%s", s, nl);
		}

	#endif

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
void dng_show_message_f (const char *fmt, ... )
|
|
|
{
|
|
|
|
|
|
char buffer [2048];
|
|
|
|
|
|
va_list ap;
|
|
|
va_start (ap, fmt);
|
|
|
|
|
|
vsnprintf (buffer, sizeof (buffer), fmt, ap);
|
|
|
|
|
|
va_end (ap);
|
|
|
|
|
|
dng_show_message (buffer);
|
|
|
|
|
|
}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
#endif
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Computes the size of a pixel buffer able to hold a tile of the given
// dimensions: (optionally padded) width * height * TagTypeSize (pixelType) *
// numPlanes.
//
// When paddingType is padSIMDBytes the width is first rounded up with
// RoundUpForPixelSize. Every multiplication is overflow-checked.
//
// Throws (via ThrowMemoryFull) if either tile dimension is negative, and
// (via ThrowOverflow) if rounding up or any product does not fit in a uint32.

uint32 ComputeBufferSize (uint32 pixelType,
						  const dng_point &tileSize,
						  uint32 numPlanes,
						  PaddingType paddingType)
	{

	// The tile dimensions are signed; reject negative values before
	// converting them to uint32.

	const bool dimensionsValid = (tileSize.h >= 0) && (tileSize.v >= 0);

	if (!dimensionsValid)
		{
		ThrowMemoryFull ("Negative tile size");
		}

	const uint32 widthPixels  = static_cast<uint32> (tileSize.h);
	const uint32 heightPixels = static_cast<uint32> (tileSize.v);

	const uint32 bytesPerSample = TagTypeSize (pixelType);

	// Pad the width if the caller asked for SIMD padding.

	uint32 rowWidth = widthPixels;

	if (paddingType == padSIMDBytes)
		{

		const bool rounded = RoundUpForPixelSize (rowWidth,
												  bytesPerSample,
												  &rowWidth);

		if (!rounded)
			{
			ThrowOverflow ("Arithmetic overflow computing buffer size");
			}

		}

	// Multiply everything together, stopping at the first overflow.

	uint32 total = 0;

	const bool fits = SafeUint32Mult (rowWidth, heightPixels  , &total) &&
					  SafeUint32Mult (total   , bytesPerSample, &total) &&
					  SafeUint32Mult (total   , numPlanes     , &total);

	if (!fits)
		{
		ThrowOverflow ("Arithmetic overflow computing buffer size");
		}

	return total;

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Returns the current value of a high-resolution platform timer, in seconds.
// Only differences between two returned values are meaningful; the origin
// of the time scale depends on the platform timer that is used.
//
// The cached tick-to-seconds multipliers below are initialised through
// function-local statics, so the one-time setup is safe when several
// threads make their first call concurrently (guaranteed since C++11).

real64 TickTimeInSeconds ()
	{

	#if qWinOS

	// The reciprocal of the performance-counter frequency is computed once
	// and cached, which also avoids a costly divide on every call.
	//
	// QueryPerformanceCounter is a slow call compared to rdtsc, but it is
	// not tied to CPU speed stepping.
	//
	// NOTE(review): earlier comments here claimed the frequency can change
	// in low-power CPU modes (making results jump backwards) and therefore
	// has to be re-read on every call, although the code has always cached
	// it. Microsoft documents the frequency as fixed at system boot;
	// confirm for the Windows versions being targeted. Plugging in a laptop
	// while doing timings minimizes any such effect.

	static const real64 freqMultiplier = [] () -> real64
		{

		LARGE_INTEGER freq;

		QueryPerformanceFrequency (&freq);

		return 1.0 / (real64) freq.QuadPart;

		} ();

	LARGE_INTEGER cycles;

	QueryPerformanceCounter (&cycles);

	return (real64) cycles.QuadPart * freqMultiplier;

	#elif qiPhone || qMacOS

	// Cache the tick-to-seconds factor of the high-performance timer.
	// This assumes numer / denom converts mach_absolute_time ticks to
	// nanoseconds; the 1.0e-9 factor then converts nanoseconds to seconds.

	static const real64 freqMultiplier = [] () -> real64
		{

		mach_timebase_info_data_t freq;

		mach_timebase_info (&freq);

		return ((real64) freq.numer / (real64) freq.denom) * 1.0e-9;

		} ();

	return mach_absolute_time () * freqMultiplier;

	#elif qAndroid || qLinux

	// Fast monotonic timer with nanosecond fields.

	struct timespec now;

	clock_gettime (CLOCK_MONOTONIC, &now);

	return now.tv_sec + (real64) now.tv_nsec * 1.0e-9;

	#else

	// Perhaps a better call exists (e.g. one that avoids adjtime effects).

	struct timeval tv;

	gettimeofday (&tv, NULL);

	return tv.tv_sec + (real64) tv.tv_usec * 1.0e-6;

	#endif

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Currently just forwards to TickTimeInSeconds and returns the same value.

real64 TickCountInSeconds ()
	{

	const real64 seconds = TickTimeInSeconds ();

	return seconds;

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Nesting depth of live dng_timer objects: raised by DNGIncrementTimerLevel
// and lowered by DNGDecrementTimerLevel; the dng_timer destructor uses it to
// indent its output.

static std::atomic<int> sTimerLevel {0};
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
void DNGIncrementTimerLevel ()
|
|
|
{
|
|
|
|
|
|
// This isn't thread coherent, multiple threads can create/destroy cr_timer
|
|
|
// causing the tabbing to be invalid. Imagecore disables this.
|
|
|
|
|
|
if (!gImagecore)
|
|
|
{
|
|
|
|
|
|
sTimerLevel++;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Lowers the timer nesting level by one and returns the new level.
// Under Imagecore the level is not tracked and 0 is always returned.

int32 DNGDecrementTimerLevel ()
	{

	return gImagecore ? 0
					  : (int32) (--sTimerLevel);

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Starts a timer: remembers the label and the current tick time, then
// raises the timer nesting level. The message pointer is stored as passed,
// not copied, and is read again by the destructor.

dng_timer::dng_timer (const char *message)

	:	fMessage   (message)
	,	fStartTime (TickTimeInSeconds ())

	{

	DNGIncrementTimerLevel ();

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Stops the timer. The nesting level is always lowered; the elapsed time is
// reported only when gDNGShowTimers is set. Output goes to stderr, indented
// by two spaces per nesting level (capped at level 10), or to cr_logi when
// running under Imagecore with cr logging compiled in.

dng_timer::~dng_timer ()
	{

	// Lower the level before any early return so it stays balanced with the
	// increment done by the constructor. The result is kept signed because
	// it is later passed as a printf field width, which must be an int.

	const int32 level = Pin_int32 (0, DNGDecrementTimerLevel (), 10);

	if (!gDNGShowTimers)
		return;

	const real64 totalTime = TickTimeInSeconds () - fStartTime;

	#if defined(qCRLogging) && qCRLogging && defined(cr_logi)

	if (gImagecore)
		{

		// Imagecore force-includes cr_log and redirects DNG logging to its
		// own logger under a mutex. No indenting and no fprintf to stderr
		// here: these messages should be buffered.

		cr_logi("timer", "%s: %0.3f sec\n", fMessage, totalTime);

		return;

		}

	#endif

	// "%*s" with an empty string emits (level * 2) spaces of indentation.

	fprintf (stderr,
			 "%*s%s: %0.3f sec\n",
			 (int) (level * 2),
			 "",
			 fMessage,
			 totalTime);

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Returns the largest squared distance from 'point' to any of the four
// corners of 'rect'.

real64 MaxSquaredDistancePointToRect (const dng_point_real64 &point,
									  const dng_rect_real64 &rect)
	{

	// Corners in the order they are compared: TL, BL, BR, TR.

	const dng_point_real64 corners [4] =
		{
		rect.TL (),
		rect.BL (),
		rect.BR (),
		rect.TR ()
		};

	real64 farthest = DistanceSquared (point, corners [0]);

	for (int k = 1; k < 4; k++)
		{

		farthest = Max_real64 (farthest,
							   DistanceSquared (point, corners [k]));

		}

	return farthest;

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Returns the square root of MaxSquaredDistancePointToRect (point, rect).

real64 MaxDistancePointToRect (const dng_point_real64 &point,
							   const dng_rect_real64 &rect)
	{

	const real64 squared = MaxSquaredDistancePointToRect (point, rect);

	return sqrt (squared);

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Fills the noise buffer with kRNGSize2D 16-bit values taken from the
// DNG_Random sequence, starting from a fixed seed so the table is the same
// on every run. Values below 255 are rejected (see below).

dng_dither::dng_dither ()

	:	fNoiseBuffer ()

	{

	const uint32 kSeed = 1;

	fNoiseBuffer.Allocate (kRNGSize2D * sizeof (uint16));

	uint16 *const first = fNoiseBuffer.Buffer_uint16 ();

	uint16 *const last = first + kRNGSize2D;

	uint32 state = kSeed;

	for (uint16 *slot = first; slot != last; ++slot)
		{

		// The correct math for a 16-bit to 8-bit dither would be:
		//
		//		y = (x * 255 + r) / 65535;		(0 <= r <= 65534)
		//
		// The bottlenecks use a faster approximation of this math, with a
		// power of two for the division:
		//
		//		y = (x * 255 + r) / 65536;		(255 <= r <= 65535)
		//
		// To ensure that every exact 8-bit value in 16-bit space round
		// trips to the same 8-bit value, r has to be limited to the range
		// 255 to 65535.
		//
		// This makes the dither effect slightly imperfect, but correct
		// round-tripping of 8-bit values is far more important.

		uint16 sample;

		for (;;)
			{

			state = DNG_Random (state);

			sample = (uint16) state;

			if (sample >= 255)
				{
				break;
				}

			}

		*slot = sample;

		}

	}
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
|
|
// Returns the single shared dng_dither object, which is constructed the
// first time this function is called.

const dng_dither & dng_dither::Get ()
	{

	static dng_dither sSharedDither;

	return sSharedDither;

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Builds a histogram of one plane of a 16-bit (ttShort) image over 'area'.
//
// The first (maxValue + 1) entries of 'hist' are zeroed and then incremented
// once per pixel value; pixel values greater than maxValue are not counted.
// The host argument is unused.

void HistogramArea (dng_host & /* host */,
					const dng_image &image,
					const dng_rect &area,
					uint32 *hist,
					uint32 maxValue,
					uint32 plane)
	{

	DNG_ASSERT (image.PixelType () == ttShort, "Unsupported pixel type");

	DoZeroBytes (hist, (maxValue + 1) * (uint32) sizeof (uint32));

	dng_rect tile;

	dng_tile_iterator iter (image, area);

	while (iter.GetOneTile (tile))
		{

		dng_const_tile_buffer buffer (image, tile);

		const void *sPtr = buffer.ConstPixel (tile.t,
											  tile.l,
											  plane);

		// A single plane is treated as a 1 x rows x cols volume; OptimizeOrder
		// is then given the chance to adjust the counts and steps.

		uint32 outerCount = 1;
		uint32 rowCount   = tile.H ();
		uint32 colCount   = tile.W ();

		int32 outerStep = 0;
		int32 rowStep   = buffer.fRowStep;
		int32 colStep   = buffer.fColStep;

		OptimizeOrder (sPtr,
					   buffer.fPixelSize,
					   outerCount,
					   rowCount,
					   colCount,
					   outerStep,
					   rowStep,
					   colStep);

		DNG_ASSERT (outerCount == 1, "OptimizeOrder logic error");

		// With the full 16-bit range and contiguous pixels no range check
		// is needed: every uint16 value is a valid histogram index.

		const bool fullRangePacked = (maxValue == 0x0FFFF && colStep == 1);

		const uint16 *rowPtr = (const uint16 *) sPtr;

		for (uint32 row = 0; row < rowCount; row++, rowPtr += rowStep)
			{

			if (fullRangePacked)
				{

				for (uint32 col = 0; col < colCount; col++)
					{

					hist [rowPtr [col]] ++;

					}

				}

			else
				{

				const uint16 *pixel = rowPtr;

				for (uint32 col = 0; col < colCount; col++)
					{

					const uint32 value = *pixel;

					if (value <= maxValue)
						{
						hist [value] ++;
						}

					pixel += colStep;

					}

				}

			}

		}

	}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
template <SIMDType simd>
|
|
|
class dng_limit_float_depth_task: public dng_area_task
|
|
|
{
|
|
|
|
|
|
private:
|
|
|
|
|
|
const dng_image &fSrcImage;
|
|
|
|
|
|
dng_image &fDstImage;
|
|
|
|
|
|
uint32 fBitDepth;
|
|
|
|
|
|
real32 fScale;
|
|
|
|
|
|
public:
|
|
|
|
|
|
dng_limit_float_depth_task (const dng_image &srcImage,
|
|
|
dng_image &dstImage,
|
|
|
uint32 bitDepth,
|
|
|
real32 scale);
|
|
|
|
|
|
virtual dng_rect RepeatingTile1 () const
|
|
|
{
|
|
|
return fSrcImage.RepeatingTile ();
|
|
|
}
|
|
|
|
|
|
virtual dng_rect RepeatingTile2 () const
|
|
|
{
|
|
|
return fDstImage.RepeatingTile ();
|
|
|
}
|
|
|
|
|
|
virtual void Process (uint32 threadIndex,
|
|
|
const dng_rect &tile,
|
|
|
dng_abort_sniffer *sniffer);
|
|
|
|
|
|
};
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
template <SIMDType simd>
|
|
|
dng_limit_float_depth_task<simd>::dng_limit_float_depth_task
|
|
|
(const dng_image &srcImage,
|
|
|
dng_image &dstImage,
|
|
|
uint32 bitDepth,
|
|
|
real32 scale)
|
|
|
|
|
|
: dng_area_task ("dng_limit_float_depth_task")
|
|
|
|
|
|
, fSrcImage (srcImage)
|
|
|
, fDstImage (dstImage)
|
|
|
, fBitDepth (bitDepth)
|
|
|
, fScale (scale)
|
|
|
|
|
|
{
|
|
|
|
|
|
}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
template <SIMDType simd>
|
|
|
void dng_limit_float_depth_task<simd>::Process (uint32 /* threadIndex */,
|
|
|
const dng_rect &tile,
|
|
|
dng_abort_sniffer * /* sniffer */)
|
|
|
{
|
|
|
|
|
|
INTEL_COMPILER_NEEDED_NOTE
|
|
|
|
|
|
SET_CPU_FEATURE (simd);
|
|
|
|
|
|
dng_const_tile_buffer srcBuffer (fSrcImage, tile);
|
|
|
dng_dirty_tile_buffer dstBuffer (fDstImage, tile);
|
|
|
|
|
|
uint32 count0 = tile.H ();
|
|
|
uint32 count1 = tile.W ();
|
|
|
uint32 count2 = fDstImage.Planes ();
|
|
|
|
|
|
int32 sStep0 = srcBuffer.fRowStep;
|
|
|
int32 sStep1 = srcBuffer.fColStep;
|
|
|
int32 sStep2 = srcBuffer.fPlaneStep;
|
|
|
|
|
|
int32 dStep0 = dstBuffer.fRowStep;
|
|
|
int32 dStep1 = dstBuffer.fColStep;
|
|
|
int32 dStep2 = dstBuffer.fPlaneStep;
|
|
|
|
|
|
const void *sPtr = srcBuffer.ConstPixel (tile.t,
|
|
|
tile.l,
|
|
|
0);
|
|
|
|
|
|
void *dPtr = dstBuffer.DirtyPixel (tile.t,
|
|
|
tile.l,
|
|
|
0);
|
|
|
|
|
|
OptimizeOrder (sPtr,
|
|
|
dPtr,
|
|
|
srcBuffer.fPixelSize,
|
|
|
dstBuffer.fPixelSize,
|
|
|
count0,
|
|
|
count1,
|
|
|
count2,
|
|
|
sStep0,
|
|
|
sStep1,
|
|
|
sStep2,
|
|
|
dStep0,
|
|
|
dStep1,
|
|
|
dStep2);
|
|
|
|
|
|
const real32 *sPtr0 = (const real32 *) sPtr;
|
|
|
real32 *dPtr0 = ( real32 *) dPtr;
|
|
|
|
|
|
real32 scale = fScale;
|
|
|
|
|
|
bool limit16 = (fBitDepth == 16);
|
|
|
bool limit24 = (fBitDepth == 24);
|
|
|
|
|
|
for (uint32 index0 = 0; index0 < count0; index0++)
|
|
|
{
|
|
|
|
|
|
const real32 *sPtr1 = sPtr0;
|
|
|
real32 *dPtr1 = dPtr0;
|
|
|
|
|
|
for (uint32 index1 = 0; index1 < count1; index1++)
|
|
|
{
|
|
|
|
|
|
// If the scale is a NOP, and the data is packed solid, we can just do memory
|
|
|
// copy.
|
|
|
|
|
|
if (scale == 1.0f && sStep2 == 1 && dStep2 == 1)
|
|
|
{
|
|
|
|
|
|
if (dPtr1 != sPtr1) // srcImage != dstImage
|
|
|
{
|
|
|
|
|
|
memcpy (dPtr1, sPtr1, count2 * (uint32) sizeof (real32));
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
else
|
|
|
{
|
|
|
|
|
|
const real32 *sPtr2 = sPtr1;
|
|
|
real32 *dPtr2 = dPtr1;
|
|
|
INTEL_PRAGMA_SIMD_ASSERT_VECLEN_FLOAT(simd)
|
|
|
for (uint32 index2 = 0; index2 < count2; index2++)
|
|
|
{
|
|
|
|
|
|
real32 x = sPtr2 [0];
|
|
|
|
|
|
x *= scale;
|
|
|
|
|
|
dPtr2 [0] = x;
|
|
|
|
|
|
sPtr2 += sStep2;
|
|
|
dPtr2 += dStep2;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
// The data is now in the destination buffer.
|
|
|
|
|
|
if (limit16)
|
|
|
{
|
|
|
|
|
|
//start by using intrinsic __m256<35>_mm256_cvtph_ps<70>(__m128i<38>a)
|
|
|
//once the intrinsic is written, merge this branch with previous one
|
|
|
|
|
|
uint32 *dPtr2 = (uint32 *) dPtr1;
|
|
|
|
|
|
INTEL_PRAGMA_SIMD_ASSERT_VECLEN_INT32(simd)
|
|
|
|
|
|
for (uint32 index2 = 0; index2 < count2; index2++)
|
|
|
{
|
|
|
|
|
|
uint32 x = dPtr2 [0];
|
|
|
|
|
|
uint16 y = DNG_FloatToHalf (x);
|
|
|
|
|
|
x = DNG_HalfToFloat (y);
|
|
|
|
|
|
dPtr2 [0] = x;
|
|
|
|
|
|
dPtr2 += dStep2;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
else if (limit24)
|
|
|
{
|
|
|
|
|
|
uint32 *dPtr2 = (uint32 *) dPtr1;
|
|
|
|
|
|
for (uint32 index2 = 0; index2 < count2; index2++)
|
|
|
{
|
|
|
|
|
|
uint32 x = dPtr2 [0];
|
|
|
|
|
|
uint8 temp [3];
|
|
|
|
|
|
DNG_FloatToFP24 (x, temp);
|
|
|
|
|
|
x = DNG_FP24ToFloat (temp);
|
|
|
|
|
|
dPtr2 [0] = x;
|
|
|
|
|
|
dPtr2 += dStep2;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
sPtr1 += sStep1;
|
|
|
dPtr1 += dStep1;
|
|
|
|
|
|
}
|
|
|
|
|
|
sPtr0 += sStep0;
|
|
|
dPtr0 += dStep0;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
/******************************************************************************/
|
|
|
|
|
|
template <SIMDType simd>
|
|
|
void LimitFloatBitDepth (dng_host &host,
|
|
|
const dng_image &srcImage,
|
|
|
dng_image &dstImage,
|
|
|
uint32 bitDepth,
|
|
|
real32 scale)
|
|
|
{
|
|
|
|
|
|
DNG_ASSERT (srcImage.PixelType () == ttFloat, "Floating point image expected");
|
|
|
DNG_ASSERT (dstImage.PixelType () == ttFloat, "Floating point image expected");
|
|
|
|
|
|
dng_limit_float_depth_task<simd> task (srcImage,
|
|
|
dstImage,
|
|
|
bitDepth,
|
|
|
scale);
|
|
|
|
|
|
host.PerformAreaTask (task, dstImage.Bounds ());
|
|
|
|
|
|
}
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
template
|
|
|
void LimitFloatBitDepth<Scalar> (dng_host &host,
|
|
|
const dng_image &srcImage,
|
|
|
dng_image &dstImage,
|
|
|
uint32 bitDepth,
|
|
|
real32 scale);
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
#if qDNGIntelCompiler

// Explicit instantiation of the AVX2 variant; only built with the Intel
// compiler.

template
void LimitFloatBitDepth<AVX2> (dng_host &,
							   const dng_image &,
							   dng_image &,
							   uint32,
							   real32);

#endif	// qDNGIntelCompiler
|
|
|
|
|
|
/*****************************************************************************/
|
|
|
|
|
|
// Public entry point: scales the floating-point pixels of srcImage into
// dstImage and limits their precision according to bitDepth, choosing the
// implementation for the best available SIMD level. At present this is
// always the scalar implementation (see below).

void LimitFloatBitDepth (dng_host &host,
						 const dng_image &srcImage,
						 dng_image &dstImage,
						 uint32 bitDepth,
						 real32 scale)
	{

	// Kludge: the AVX2 path is turned off for now (note the "&& 0") because
	// it produces slightly different results from the Scalar routine. That
	// causes a mismatch in raw digest values when building HDR merge result
	// negatives, which in turn makes the client display a "file appears to
	// be damaged" warning.
	// -bury 11/13/2017

	#if (qDNGIntelCompiler && qDNGExperimental && 0)

	if (gDNGMaxSIMD >= AVX2)
		{

		LimitFloatBitDepth<AVX2> (host,
								  srcImage,
								  dstImage,
								  bitDepth,
								  scale);

		return;

		}

	#endif	// qDNGIntelCompiler && qDNGExperimental

	LimitFloatBitDepth<Scalar> (host,
								srcImage,
								dstImage,
								bitDepth,
								scale);

	}
|
|
|
|
|
|
/*****************************************************************************/
|