You are right: your right-and-left solution is elegant and runs fast.
What I did was take a snippet of your code and adapt it to what I had already done; the gain is minimal, to say the least. There are also differences between machines, so the measurements may differ.
Some results for "rept 10000":
your rot
Processor 0
Clock Core cyc Instruct Uops BrTaken
164772 162016 210012 243834 30000
140896 152612 210001 240022 30000
140874 152643 210001 240013 30000
140954 152642 210001 240013 30000
align 16
;-----------------------------------------------------------------------
; rot3 - rotate the 128-bit value at `number` left by `counter` bits (SSE2)
; In:    [counter] = rotate count (dword); handled range is 0..127,
;                    larger counts are NOT reduced modulo 128
;        [number]  = 128-bit operand (loaded unaligned)
; Out:   xmm0      = rotated value
; Clobb: ecx, xmm1, xmm3, xmm4, flags
;-----------------------------------------------------------------------
rot3 proc
        movdqu  xmm0, oword ptr [number]    ; xmm0 = hi:lo
        mov     ecx, [counter]
        cmp     ecx, 64
        jb      @F                          ; count < 64: no qword swap needed
        sub     ecx, 64                     ; a rotate by 64 is just a qword swap,
        pshufd  xmm0, xmm0, 4Eh             ; so swap halves and keep count-64
@@:
        movd    xmm3, ecx                   ; xmm3 = n      (left-shift count)
        movdqa  xmm1, xmm0                  ; second copy supplies the carried-over bits
        sub     ecx, 64
        neg     ecx                         ; ecx  = 64 - n
        movd    xmm4, ecx                   ; xmm4 = 64 - n (right-shift count)
        psllq   xmm0, xmm3                  ; each qword << n
        psrlq   xmm1, xmm4                  ; each qword >> (64-n); n = 0 shifts by 64 -> 0
        pshufd  xmm1, xmm1, 4Eh             ; move carried bits into the opposite qword
        por     xmm0, xmm1                  ; merge shifted halves with carried bits
        ret
rot3 endp
Processor 0
Clock Core cyc Instruct Uops BrTaken
180560 174677 190012 233851 20000
135620 146905 190001 230022 20000
135610 146932 190001 230019 20000
135648 146918 190001 230019 20000
align 16
;-----------------------------------------------------------------------
; rot6 - rotate the 128-bit value at `number` left by `counter` bits (GPR)
; In:    counter    = rotate count (dword); handled range is 0..127,
;                     larger counts are NOT reduced modulo 128
;        [number]   = low qword, [number+8] = high qword
; Out:   rax        = high qword of the result
;        rdx        = low qword of the result
; Clobb: rcx, r8, flags
; Note:  r8 is the scratch register rather than rbx, because rbx is
;        callee-saved under both the Microsoft x64 and System V AMD64
;        conventions while r8 is volatile in both; no save/restore needed.
;-----------------------------------------------------------------------
rot6 proc
        mov     ecx, counter
        mov     rax, [number+8]             ; rax = high qword
        mov     rdx, [number]               ; rdx = low qword
        .if ecx >= 64
            ; a rotate by 64 is a plain swap of the halves; keep count-64
            mov     r8, rax
            mov     rax, rdx
            mov     rdx, r8
            sub     ecx, 64
        .endif
        xor     r8d, r8d                    ; r8 = 0, receives bits leaving the high qword
        shld    r8, rax, cl                 ; r8  = top cl bits of high (wrap-around bits)
        shld    rax, rdx, cl                ; high = high << cl | low >> (64-cl)
        shl     rdx, cl                     ; low  = low << cl
        or      rdx, r8                     ; low |= bits wrapped around from high
        ret
rot6 endp
Processor 0
Clock Core cyc Instruct Uops BrTaken
148766 144041 180012 283841 20000
122292 132467 180001 280025 20000
122310 132504 180001 280019 20000
122304 132506 180001 280020 20000