| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209 | 
							- From dbf03b37592b79d891cb796fda13d8495a6a5234 Mon Sep 17 00:00:00 2001
 
- From: Harm Hanemaaijer <fgenfb@yahoo.com>
 
- Date: Thu, 20 Jun 2013 20:21:39 +0200
 
- Subject: [PATCH 057/232] Speed up console framebuffer imageblit function
 
- Especially on platforms with a slower CPU but a relatively high
 
- framebuffer fill bandwidth, like current ARM devices, the existing
 
- console monochrome imageblit function used to draw console text is
 
- suboptimal for common pixel depths such as 16bpp and 32bpp. The existing
 
- code is quite general and can deal with several pixel depths. By creating
 
- special case functions for 16bpp and 32bpp, by far the most common pixel
 
- formats used on modern systems, a significant speed-up is attained
 
- which can be readily felt on ARM-based devices like the Raspberry Pi
 
- and the Allwinner platform, but should help any platform using the
 
- fb layer.
 
- The special case functions allow constant folding, eliminating a number
 
- of instructions including divide operations, and allow the use of an
 
- unrolled loop, eliminating instructions with a variable shift size,
 
- reducing source memory access instructions, and eliminating excessive
 
- branching. These unrolled loops also allow much better code optimization
 
- by the C compiler. The code that selects which optimized variant is used
 
- is also simplified, eliminating integer divide instructions.
 
- The speed-up, measured by timing 'cat file.txt' in the console, varies
 
- between 40% and 70%, when testing on the Raspberry Pi and Allwinner
 
- ARM-based platforms, depending on font size and the pixel depth, with
 
- the greater benefit for 32bpp.
 
- Signed-off-by: Harm Hanemaaijer <fgenfb@yahoo.com>
 
- ---
 
-  drivers/video/fbdev/core/cfbimgblt.c | 152 +++++++++++++++++++++++++++++++++--
 
-  1 file changed, 147 insertions(+), 5 deletions(-)
 
- --- a/drivers/video/fbdev/core/cfbimgblt.c
 
- +++ b/drivers/video/fbdev/core/cfbimgblt.c
 
- @@ -28,6 +28,11 @@
 
-   *
 
-   *  Also need to add code to deal with cards endians that are different than
 
-   *  the native cpu endians. I also need to deal with MSB position in the word.
 
- + *  Modified by Harm Hanemaaijer (fgenfb@yahoo.com) 2013:
 
- + *  - Provide optimized versions of fast_imageblit for 16 and 32bpp that are
 
- + *    significantly faster than the previous implementation.
 
- + *  - Simplify the fast/slow_imageblit selection code, avoiding integer
 
- + *    divides.
 
-   */
 
-  #include <linux/module.h>
 
-  #include <linux/string.h>
 
- @@ -262,6 +267,133 @@ static inline void fast_imageblit(const
 
-  	}
 
-  }	
 
-  	
 
- +/*
 
- + * fast_imageblit specialised for bpp == 16: every 32-bit write carries two
 
- + * pixels (ppw = 2, bit_mask = 3 folded into the code), main loop unrolled.
 
- + */
 
- +
 
- +static inline void fast_imageblit16(const struct fb_image *image,
 
- +				    struct fb_info *p, u8 __iomem * dst1,
 
- +				    u32 fgcolor, u32 bgcolor)
 
- +{
 
- +	u32 fgx = fgcolor, bgx = bgcolor;
 
- +	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
 
- +	u32 end_mask, eorx;
 
- +	const char *s = image->data, *src;
 
- +	u32 __iomem *dst;
 
- +	const u32 *tab = NULL;
 
- +	int i, j, k;
 
- +
 
- +	tab = fb_be_math(p) ? cfb_tab16_be : cfb_tab16_le; /* 2 bits -> mask */
 
- +
 
- +	fgx <<= 16;	/* replicate each colour into both 16-bit halves */
 
- +	bgx <<= 16;
 
- +	fgx |= fgcolor;
 
- +	bgx |= bgcolor;
 
- +
 
- +	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fg where mask set, else bg */
 
- +	k = image->width / 2;	/* 32-bit writes per row */
 
- +
 
- +	for (i = image->height; i--;) {	/* one pass per row */
 
- +		dst = (u32 __iomem *) dst1;
 
- +		src = s;
 
- +
 
- +		j = k;
 
- +		while (j >= 4) {	/* whole source byte: 8 pixels, 4 writes */
 
- +			u8 bits = *src;
 
- +			end_mask = tab[(bits >> 6) & 3];	/* top two bits first */
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 4) & 3];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 2) & 3];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[bits & 3];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			src++;
 
- +			j -= 4;
 
- +		}
 
- +		if (j != 0) {	/* 1-3 writes left, taken from the next byte */
 
- +			u8 bits = *src;
 
- +			end_mask = tab[(bits >> 6) & 3];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			if (j >= 2) {
 
- +				end_mask = tab[(bits >> 4) & 3];
 
- +				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +				if (j == 3) {
 
- +					end_mask = tab[(bits >> 2) & 3];
 
- +					FB_WRITEL((end_mask & eorx) ^ bgx, dst); /* last */
 
- +				}
 
- +			}
 
- +		}
 
- +		dst1 += p->fix.line_length;	/* next destination row */
 
- +		s += spitch;	/* next source row */
 
- +	}
 
- +}
 
- +
 
- +/*
 
- + * fast_imageblit specialised for bpp == 32: every 32-bit write carries one
 
- + * pixel (ppw = 1, bit_mask = 1 folded into the code), main loop unrolled.
 
- + */
 
- +
 
- +static inline void fast_imageblit32(const struct fb_image *image,
 
- +				    struct fb_info *p, u8 __iomem * dst1,
 
- +				    u32 fgcolor, u32 bgcolor)
 
- +{
 
- +	u32 fgx = fgcolor, bgx = bgcolor;
 
- +	u32 spitch = (image->width + 7) / 8;	/* source bytes per row */
 
- +	u32 end_mask, eorx;
 
- +	const char *s = image->data, *src;
 
- +	u32 __iomem *dst;
 
- +	const u32 *tab = NULL;
 
- +	int i, j, k;
 
- +
 
- +	tab = cfb_tab32;	/* indexed by a single source bit */
 
- +
 
- +	eorx = fgx ^ bgx;	/* (mask & eorx) ^ bgx: fg where mask set, else bg */
 
- +	k = image->width;	/* one write per pixel */
 
- +
 
- +	for (i = image->height; i--;) {	/* one pass per row */
 
- +		dst = (u32 __iomem *) dst1;
 
- +		src = s;
 
- +
 
- +		j = k;
 
- +		while (j >= 8) {	/* whole source byte: 8 pixels, 8 writes */
 
- +			u8 bits = *src;
 
- +			end_mask = tab[(bits >> 7) & 1];	/* top bit first */
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 6) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 5) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 4) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 3) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 2) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[(bits >> 1) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			end_mask = tab[bits & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +			src++;
 
- +			j -= 8;
 
- +		}
 
- +		if (j != 0) {	/* 1-7 pixels left, taken from the next byte */
 
- +			u32 bits = (u32) * src;
 
- +			while (j > 1) {	/* all but the last remaining pixel */
 
- +				end_mask = tab[(bits >> 7) & 1];
 
- +				FB_WRITEL((end_mask & eorx) ^ bgx, dst++);
 
- +				bits <<= 1;	/* move the next pixel into bit 7 */
 
- +				j--;
 
- +			}
 
- +			end_mask = tab[(bits >> 7) & 1];
 
- +			FB_WRITEL((end_mask & eorx) ^ bgx, dst);	/* last pixel */
 
- +		}
 
- +		dst1 += p->fix.line_length;	/* next destination row */
 
- +		s += spitch;	/* next source row */
 
- +	}
 
- +}
 
- +
 
-  void cfb_imageblit(struct fb_info *p, const struct fb_image *image)
 
-  {
 
-  	u32 fgcolor, bgcolor, start_index, bitstart, pitch_index = 0;
 
- @@ -294,11 +426,21 @@ void cfb_imageblit(struct fb_info *p, co
 
-  			bgcolor = image->bg_color;
 
-  		}	
 
-  		
 
- -		if (32 % bpp == 0 && !start_index && !pitch_index && 
 
- -		    ((width & (32/bpp-1)) == 0) &&
 
- -		    bpp >= 8 && bpp <= 32) 			
 
- -			fast_imageblit(image, p, dst1, fgcolor, bgcolor);
 
- -		else 
 
- +		if (!start_index && !pitch_index) {
 
- +			if (bpp == 32)
 
- +				fast_imageblit32(image, p, dst1, fgcolor,
 
- +						 bgcolor);
 
- +			else if (bpp == 16 && (width & 1) == 0)
 
- +				fast_imageblit16(image, p, dst1, fgcolor,
 
- +						 bgcolor);
 
- +			else if (bpp == 8 && (width & 3) == 0)
 
- +				fast_imageblit(image, p, dst1, fgcolor,
 
- +					       bgcolor);
 
- +			else
 
- +				slow_imageblit(image, p, dst1, fgcolor,
 
- +					       bgcolor,
 
- +					       start_index, pitch_index);
 
- +		} else
 
-  			slow_imageblit(image, p, dst1, fgcolor, bgcolor,
 
-  					start_index, pitch_index);
 
-  	} else
 
 
  |