Submitted By: William Harrington Date: 2008-08-24 Initial Package Version: 1.5 Upstream Status: Rejected Description: Use a local memcpy instead of the glibc implementation for ping This resolves the bus errors diff -Naur inetutils-1.5.orig/ping/Makefile.in inetutils-1.5/ping/Makefile.in --- inetutils-1.5.orig/ping/Makefile.in 2006-10-21 11:59:48.000000000 +0000 +++ inetutils-1.5/ping/Makefile.in 2007-06-28 23:10:43.000000000 +0000 @@ -109,7 +109,8 @@ PROGRAMS = $(bin_PROGRAMS) am_ping_OBJECTS = ping.$(OBJEXT) ping_common.$(OBJEXT) \ ping_echo.$(OBJEXT) ping_address.$(OBJEXT) \ - ping_router.$(OBJEXT) ping_timestamp.$(OBJEXT) + ping_router.$(OBJEXT) ping_timestamp.$(OBJEXT) \ + wordcopy.$(OBJEXT) ping_OBJECTS = $(am_ping_OBJECTS) ping_DEPENDENCIES = am_ping6_OBJECTS = ping6.$(OBJEXT) ping_common.$(OBJEXT) @@ -333,8 +334,9 @@ ping_LDADD = -L../libinetutils -linetutils -L../libicmp -licmp -L../lib -lgnu ping6_LDADD = -L../lib -lgnu -L../libinetutils -linetutils INCLUDES = -I$(top_srcdir)/lib -I../lib -I$(top_srcdir)/libicmp -ping_SOURCES = ping.c ping_common.c ping_echo.c ping_address.c \ - ping_router.c ping_timestamp.c ping_common.h ping_impl.h +ping_SOURCES = memcopy.h pagecopy.h ping.c ping_common.c \ + ping_echo.c ping_address.c ping_router.c ping_timestamp.c \ + ping_common.h ping_impl.h wordcopy.c ping6_SOURCES = ping6.c ping_common.c ping_common.h ping6.h SUIDMODE = -o root -m 4775 diff -Naur inetutils-1.5.orig/ping/memcopy.h inetutils-1.5/ping/memcopy.h --- inetutils-1.5.orig/ping/memcopy.h 1970-01-01 00:00:00.000000000 +0000 +++ inetutils-1.5/ping/memcopy.h 2007-06-28 23:08:34.000000000 +0000 @@ -0,0 +1,150 @@ +/* memcopy.h -- definitions for memory copy functions. Generic C version. + Copyright (C) 1991, 1992, 1993, 1997, 2004 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* The strategy of the memory functions is: + + 1. Copy bytes until the destination pointer is aligned. + + 2. Copy words in unrolled loops. If the source and destination + are not aligned in the same way, use word memory operations, + but shift and merge two read words before writing. + + 3. Copy the few remaining bytes. + + This is fast on processors that have at least 10 registers for + allocation by GCC, and that can access memory at reg+const in one + instruction. + + I made an "exhaustive" test of this memmove when I wrote it, + exhaustive in the sense that I tried all alignment and length + combinations, with and without overlap. */ + +#include +#include + +/* The macros defined in this file are: + + BYTE_COPY_FWD(dst_beg_ptr, src_beg_ptr, nbytes_to_copy) + + BYTE_COPY_BWD(dst_end_ptr, src_end_ptr, nbytes_to_copy) + + WORD_COPY_FWD(dst_beg_ptr, src_beg_ptr, nbytes_remaining, nbytes_to_copy) + + WORD_COPY_BWD(dst_end_ptr, src_end_ptr, nbytes_remaining, nbytes_to_copy) + + MERGE(old_word, sh_1, new_word, sh_2) + [I fail to understand. I feel stupid. --roland] +*/ + +/* Type to use for aligned memory operations. + This should normally be the biggest type supported by a single load + and store. */ +#define op_t unsigned long int +#define OPSIZ (sizeof(op_t)) + +/* Type to use for unaligned operations. */ +typedef unsigned char byte; + +/* Optimal type for storing bytes in registers. */ +#define reg_char char + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define MERGE(w0, sh_1, w1, sh_2) (((w0) >> (sh_1)) | ((w1) << (sh_2))) +#endif +#if __BYTE_ORDER == __BIG_ENDIAN +#define MERGE(w0, sh_1, w1, sh_2) (((w0) << (sh_1)) | ((w1) >> (sh_2))) +#endif + +/* Copy exactly NBYTES bytes from SRC_BP to DST_BP, + without any assumptions about alignment of the pointers. */ +#define BYTE_COPY_FWD(dst_bp, src_bp, nbytes) \ + do \ + { \ + size_t __nbytes = (nbytes); \ + while (__nbytes > 0) \ + { \ + byte __x = ((byte *) src_bp)[0]; \ + src_bp += 1; \ + __nbytes -= 1; \ + ((byte *) dst_bp)[0] = __x; \ + dst_bp += 1; \ + } \ + } while (0) + +/* Copy exactly NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR, + beginning at the bytes right before the pointers and continuing towards + smaller addresses. Don't assume anything about alignment of the + pointers. */ +#define BYTE_COPY_BWD(dst_ep, src_ep, nbytes) \ + do \ + { \ + size_t __nbytes = (nbytes); \ + while (__nbytes > 0) \ + { \ + byte __x; \ + src_ep -= 1; \ + __x = ((byte *) src_ep)[0]; \ + dst_ep -= 1; \ + __nbytes -= 1; \ + ((byte *) dst_ep)[0] = __x; \ + } \ + } while (0) + +/* Copy *up to* NBYTES bytes from SRC_BP to DST_BP, with + the assumption that DST_BP is aligned on an OPSIZ multiple. If + not all bytes could be easily copied, store remaining number of bytes + in NBYTES_LEFT, otherwise store 0. */ +extern void _wordcopy_fwd_aligned (long int, long int, size_t) __THROW; +extern void _wordcopy_fwd_dest_aligned (long int, long int, size_t) __THROW; +#define WORD_COPY_FWD(dst_bp, src_bp, nbytes_left, nbytes) \ + do \ + { \ + if (src_bp % OPSIZ == 0) \ + _wordcopy_fwd_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \ + else \ + _wordcopy_fwd_dest_aligned (dst_bp, src_bp, (nbytes) / OPSIZ); \ + src_bp += (nbytes) & -OPSIZ; \ + dst_bp += (nbytes) & -OPSIZ; \ + (nbytes_left) = (nbytes) % OPSIZ; \ + } while (0) + +/* Copy *up to* NBYTES_TO_COPY bytes from SRC_END_PTR to DST_END_PTR, + beginning at the words (of type op_t) right before the pointers and + continuing towards smaller addresses. May take advantage of that + DST_END_PTR is aligned on an OPSIZ multiple. If not all bytes could be + easily copied, store remaining number of bytes in NBYTES_REMAINING, + otherwise store 0. */ +extern void _wordcopy_bwd_aligned (long int, long int, size_t) __THROW; +extern void _wordcopy_bwd_dest_aligned (long int, long int, size_t) __THROW; +#define WORD_COPY_BWD(dst_ep, src_ep, nbytes_left, nbytes) \ + do \ + { \ + if (src_ep % OPSIZ == 0) \ + _wordcopy_bwd_aligned (dst_ep, src_ep, (nbytes) / OPSIZ); \ + else \ + _wordcopy_bwd_dest_aligned (dst_ep, src_ep, (nbytes) / OPSIZ); \ + src_ep -= (nbytes) & -OPSIZ; \ + dst_ep -= (nbytes) & -OPSIZ; \ + (nbytes_left) = (nbytes) % OPSIZ; \ + } while (0) + + +/* Threshold value for when to enter the unrolled loops. */ +#define OP_T_THRES 16 diff -Naur inetutils-1.5.orig/ping/pagecopy.h inetutils-1.5/ping/pagecopy.h --- inetutils-1.5.orig/ping/pagecopy.h 1970-01-01 00:00:00.000000000 +0000 +++ inetutils-1.5/ping/pagecopy.h 2007-06-28 23:08:34.000000000 +0000 @@ -0,0 +1,75 @@ +/* Macros for copying by pages; used in memcpy, memmove. Generic macros. + Copyright (C) 1995, 1997 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* This file defines the macro: + + PAGE_COPY_FWD_MAYBE (dstp, srcp, nbytes_left, nbytes) + + which is invoked like WORD_COPY_FWD et al. The pointers should be at + least word aligned. This will check if virtual copying by pages can and + should be done and do it if so. + + System-specific pagecopy.h files should define these macros and then + #include this file: + + PAGE_COPY_THRESHOLD + -- Minimum size for which virtual copying by pages is worthwhile. + + PAGE_SIZE + -- Size of a page. + + PAGE_COPY_FWD (dstp, srcp, nbytes_left, nbytes) + -- Macro to perform the virtual copy operation. + The pointers will be aligned to PAGE_SIZE bytes. +*/ + + +#if PAGE_COPY_THRESHOLD + +#include + +#define PAGE_COPY_FWD_MAYBE(dstp, srcp, nbytes_left, nbytes) \ + do \ + { \ + if ((nbytes) >= PAGE_COPY_THRESHOLD && \ + PAGE_OFFSET ((dstp) - (srcp)) == 0) \ + { \ + /* The amount to copy is past the threshold for copying \ + pages virtually with kernel VM operations, and the \ + source and destination addresses have the same alignment. */ \ + size_t nbytes_before = PAGE_OFFSET (-(dstp)); \ + if (nbytes_before != 0) \ + { \ + /* First copy the words before the first page boundary. */ \ + WORD_COPY_FWD (dstp, srcp, nbytes_left, nbytes_before); \ + assert (nbytes_left == 0); \ + nbytes -= nbytes_before; \ + } \ + PAGE_COPY_FWD (dstp, srcp, nbytes_left, nbytes); \ + } \ + } while (0) + +/* The page size is always a power of two, so we can avoid modulo division. */ +#define PAGE_OFFSET(n) ((n) & (PAGE_SIZE - 1)) + +#else + +#define PAGE_COPY_FWD_MAYBE(dstp, srcp, nbytes_left, nbytes) /* nada */ + +#endif diff -Naur inetutils-1.5.orig/ping/ping_echo.c inetutils-1.5/ping/ping_echo.c --- inetutils-1.5.orig/ping/ping_echo.c 2006-10-11 21:46:25.000000000 +0000 +++ inetutils-1.5/ping/ping_echo.c 2007-06-28 23:08:34.000000000 +0000 @@ -57,6 +57,11 @@ #include +#ifndef memcpy +#include "memcopy.h" +#include "pagecopy.h" +#endif + static int handler (int code, void *closure, struct sockaddr_in *dest, struct sockaddr_in *from, struct ip *ip, icmphdr_t *icmp, int datalen); @@ -68,7 +73,6 @@ void print_icmp_header (struct sockaddr_in *from, struct ip *ip, icmphdr_t *icmp, int len); static void print_ip_opt (struct ip *ip, int hlen); - int ping_echo (int argc, char **argv) { @@ -169,6 +173,46 @@ timing++; tp = (struct timeval *) icmp->icmp_data; +#define OP_T_THRES 16 +#undef memcpy +void * + memcpy (dstpp, srcpp, len) + void *dstpp; +const void *srcpp; +size_t len; +{ + unsigned long int dstp = (long int) dstpp; + unsigned long int srcp = (long int) srcpp; + + /* Copy from the beginning to the end. */ + + /* If there not too few bytes to copy, use word copy. */ + if (len >= OP_T_THRES) + { + /* Copy just a few bytes to make DSTP aligned. */ + len -= (-dstp) % OPSIZ; + BYTE_COPY_FWD (dstp, srcp, (-dstp) % OPSIZ); + + /* Copy whole pages from SRCP to DSTP by virtual address + manipulation, as much as possible. */ + + PAGE_COPY_FWD_MAYBE (dstp, srcp, len, len); + + /* Copy from SRCP to DSTP taking advantage of the known alignment + of DSTP. Number of bytes remaining is put in the third argument, + i.e. in LEN. This number may vary from machine to machine. */ + + WORD_COPY_FWD (dstp, srcp, len, len); + + /* Fall out and copy the tail. */ + } + + /* There are just a few bytes to copy. Use byte memory operations. */ + BYTE_COPY_FWD (dstp, srcp, len); + + return dstpp; +} + /* Avoid unaligned data: */ memcpy (&tv1, tp, sizeof (tv1)); tvsub (&tv, &tv1); diff -Naur inetutils-1.5.orig/ping/wordcopy.c inetutils-1.5/ping/wordcopy.c --- inetutils-1.5.orig/ping/wordcopy.c 1970-01-01 00:00:00.000000000 +0000 +++ inetutils-1.5/ping/wordcopy.c 2007-06-28 23:00:04.000000000 +0000 @@ -0,0 +1,413 @@ +/* _memcopy.c -- subroutines for memory copy functions. + Copyright (C) 1991, 1996 Free Software Foundation, Inc. + This file is part of the GNU C Library. + Contributed by Torbjorn Granlund (tege@sics.se). + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* BE VERY CAREFUL IF YOU CHANGE THIS CODE...! */ + +#include +#include + +/* _wordcopy_fwd_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + Both SRCP and DSTP should be aligned for memory operations on `op_t's. */ + +void +_wordcopy_fwd_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + op_t a0, a1; + + switch (len % 8) + { + case 2: + a0 = ((op_t *) srcp)[0]; + srcp -= 6 * OPSIZ; + dstp -= 7 * OPSIZ; + len += 6; + goto do1; + case 3: + a1 = ((op_t *) srcp)[0]; + srcp -= 5 * OPSIZ; + dstp -= 6 * OPSIZ; + len += 5; + goto do2; + case 4: + a0 = ((op_t *) srcp)[0]; + srcp -= 4 * OPSIZ; + dstp -= 5 * OPSIZ; + len += 4; + goto do3; + case 5: + a1 = ((op_t *) srcp)[0]; + srcp -= 3 * OPSIZ; + dstp -= 4 * OPSIZ; + len += 3; + goto do4; + case 6: + a0 = ((op_t *) srcp)[0]; + srcp -= 2 * OPSIZ; + dstp -= 3 * OPSIZ; + len += 2; + goto do5; + case 7: + a1 = ((op_t *) srcp)[0]; + srcp -= 1 * OPSIZ; + dstp -= 2 * OPSIZ; + len += 1; + goto do6; + + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a0 = ((op_t *) srcp)[0]; + srcp -= 0 * OPSIZ; + dstp -= 1 * OPSIZ; + goto do7; + case 1: + a1 = ((op_t *) srcp)[0]; + srcp -=-1 * OPSIZ; + dstp -= 0 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do8; /* No-op. */ + } + + do + { + do8: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a1; + do7: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = a0; + do6: + a0 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = a1; + do5: + a1 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = a0; + do4: + a0 = ((op_t *) srcp)[4]; + ((op_t *) dstp)[4] = a1; + do3: + a1 = ((op_t *) srcp)[5]; + ((op_t *) dstp)[5] = a0; + do2: + a0 = ((op_t *) srcp)[6]; + ((op_t *) dstp)[6] = a1; + do1: + a1 = ((op_t *) srcp)[7]; + ((op_t *) dstp)[7] = a0; + + srcp += 8 * OPSIZ; + dstp += 8 * OPSIZ; + len -= 8; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = a1; +} + +/* _wordcopy_fwd_dest_aligned -- Copy block beginning at SRCP to + block beginning at DSTP with LEN `op_t' words (not LEN bytes!). + DSTP should be aligned for memory operations on `op_t's, but SRCP must + *not* be aligned. */ + +void +_wordcopy_fwd_dest_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + op_t a0, a1, a2, a3; + int sh_1, sh_2; + + /* Calculate how to shift a word read at the memory operation + aligned srcp to make it aligned for copy. */ + + sh_1 = 8 * (srcp % OPSIZ); + sh_2 = 8 * OPSIZ - sh_1; + + /* Make SRCP aligned by rounding it down to the beginning of the `op_t' + it points in the middle of. */ + srcp &= -OPSIZ; + + switch (len % 4) + { + case 2: + a1 = ((op_t *) srcp)[0]; + a2 = ((op_t *) srcp)[1]; + srcp -= 1 * OPSIZ; + dstp -= 3 * OPSIZ; + len += 2; + goto do1; + case 3: + a0 = ((op_t *) srcp)[0]; + a1 = ((op_t *) srcp)[1]; + srcp -= 0 * OPSIZ; + dstp -= 2 * OPSIZ; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + a3 = ((op_t *) srcp)[0]; + a0 = ((op_t *) srcp)[1]; + srcp -=-1 * OPSIZ; + dstp -= 1 * OPSIZ; + len += 0; + goto do3; + case 1: + a2 = ((op_t *) srcp)[0]; + a3 = ((op_t *) srcp)[1]; + srcp -=-2 * OPSIZ; + dstp -= 0 * OPSIZ; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do + { + do4: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); + do3: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = MERGE (a3, sh_1, a0, sh_2); + do2: + a2 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = MERGE (a0, sh_1, a1, sh_2); + do1: + a3 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = MERGE (a1, sh_1, a2, sh_2); + + srcp += 4 * OPSIZ; + dstp += 4 * OPSIZ; + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[0] = MERGE (a2, sh_1, a3, sh_2); +} + +/* _wordcopy_bwd_aligned -- Copy block finishing right before + SRCP to block finishing right before DSTP with LEN `op_t' words + (not LEN bytes!). Both SRCP and DSTP should be aligned for memory + operations on `op_t's. */ + +void +_wordcopy_bwd_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + op_t a0, a1; + + switch (len % 8) + { + case 2: + srcp -= 2 * OPSIZ; + dstp -= 1 * OPSIZ; + a0 = ((op_t *) srcp)[1]; + len += 6; + goto do1; + case 3: + srcp -= 3 * OPSIZ; + dstp -= 2 * OPSIZ; + a1 = ((op_t *) srcp)[2]; + len += 5; + goto do2; + case 4: + srcp -= 4 * OPSIZ; + dstp -= 3 * OPSIZ; + a0 = ((op_t *) srcp)[3]; + len += 4; + goto do3; + case 5: + srcp -= 5 * OPSIZ; + dstp -= 4 * OPSIZ; + a1 = ((op_t *) srcp)[4]; + len += 3; + goto do4; + case 6: + srcp -= 6 * OPSIZ; + dstp -= 5 * OPSIZ; + a0 = ((op_t *) srcp)[5]; + len += 2; + goto do5; + case 7: + srcp -= 7 * OPSIZ; + dstp -= 6 * OPSIZ; + a1 = ((op_t *) srcp)[6]; + len += 1; + goto do6; + + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + srcp -= 8 * OPSIZ; + dstp -= 7 * OPSIZ; + a0 = ((op_t *) srcp)[7]; + goto do7; + case 1: + srcp -= 9 * OPSIZ; + dstp -= 8 * OPSIZ; + a1 = ((op_t *) srcp)[8]; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do8; /* No-op. */ + } + + do + { + do8: + a0 = ((op_t *) srcp)[7]; + ((op_t *) dstp)[7] = a1; + do7: + a1 = ((op_t *) srcp)[6]; + ((op_t *) dstp)[6] = a0; + do6: + a0 = ((op_t *) srcp)[5]; + ((op_t *) dstp)[5] = a1; + do5: + a1 = ((op_t *) srcp)[4]; + ((op_t *) dstp)[4] = a0; + do4: + a0 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = a1; + do3: + a1 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = a0; + do2: + a0 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = a1; + do1: + a1 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = a0; + + srcp -= 8 * OPSIZ; + dstp -= 8 * OPSIZ; + len -= 8; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[7] = a1; +} + +/* _wordcopy_bwd_dest_aligned -- Copy block finishing right + before SRCP to block finishing right before DSTP with LEN `op_t' + words (not LEN bytes!). DSTP should be aligned for memory + operations on `op_t', but SRCP must *not* be aligned. */ + +void +_wordcopy_bwd_dest_aligned (dstp, srcp, len) + long int dstp; + long int srcp; + size_t len; +{ + op_t a0, a1, a2, a3; + int sh_1, sh_2; + + /* Calculate how to shift a word read at the memory operation + aligned srcp to make it aligned for copy. */ + + sh_1 = 8 * (srcp % OPSIZ); + sh_2 = 8 * OPSIZ - sh_1; + + /* Make srcp aligned by rounding it down to the beginning of the op_t + it points in the middle of. */ + srcp &= -OPSIZ; + srcp += OPSIZ; + + switch (len % 4) + { + case 2: + srcp -= 3 * OPSIZ; + dstp -= 1 * OPSIZ; + a2 = ((op_t *) srcp)[2]; + a1 = ((op_t *) srcp)[1]; + len += 2; + goto do1; + case 3: + srcp -= 4 * OPSIZ; + dstp -= 2 * OPSIZ; + a3 = ((op_t *) srcp)[3]; + a2 = ((op_t *) srcp)[2]; + len += 1; + goto do2; + case 0: + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + return; + srcp -= 5 * OPSIZ; + dstp -= 3 * OPSIZ; + a0 = ((op_t *) srcp)[4]; + a3 = ((op_t *) srcp)[3]; + goto do3; + case 1: + srcp -= 6 * OPSIZ; + dstp -= 4 * OPSIZ; + a1 = ((op_t *) srcp)[5]; + a0 = ((op_t *) srcp)[4]; + len -= 1; + if (OP_T_THRES <= 3 * OPSIZ && len == 0) + goto do0; + goto do4; /* No-op. */ + } + + do + { + do4: + a3 = ((op_t *) srcp)[3]; + ((op_t *) dstp)[3] = MERGE (a0, sh_1, a1, sh_2); + do3: + a2 = ((op_t *) srcp)[2]; + ((op_t *) dstp)[2] = MERGE (a3, sh_1, a0, sh_2); + do2: + a1 = ((op_t *) srcp)[1]; + ((op_t *) dstp)[1] = MERGE (a2, sh_1, a3, sh_2); + do1: + a0 = ((op_t *) srcp)[0]; + ((op_t *) dstp)[0] = MERGE (a1, sh_1, a2, sh_2); + + srcp -= 4 * OPSIZ; + dstp -= 4 * OPSIZ; + len -= 4; + } + while (len != 0); + + /* This is the right position for do0. Please don't move + it into the loop. */ + do0: + ((op_t *) dstp)[3] = MERGE (a0, sh_1, a1, sh_2); +}