Pulling hairs out of my head... :(
I'm trying to get the html content of google search and load it in to memory.
Then i can search for the image urls and show those images in my program.
I've written a routine to connect with the google server and send a search request.
But it redirects me to the https page and i don't know how to connect to that page.
HTTP/1.1 302 Found
Location: https://www.google.com/search?q=rammstein&tbm=isch
Cache-Control: private
Content-Type: text/html; charset=UTF-8
Set-Cookie: PREF=ID=2580d3f61611f780:FF=0:TM=1385822026:LM=1385822026:S=bScjZ7XszAVzBc_T; expires=Mon, 30-Nov-2015 14:33:46 GMT; path=/; domain=.google.com
Set-Cookie: NID=67=PS3svhZFPKcHfZgV1sfXdsBY4nSYd3cGCQgn6JbVrVc-7XmghKnx5NPmqSlsJ2Ib0MZ--IhhpCuxwbdCTtC2hiOWM4GLLwdM_qrIotBmHkzRs08GZvP0sHWMs9ExZP8j; expires=Sun, 01-Jun-2014 14:33:46 GMT; path=/; domain=.google.com; HttpOnly
P3P: CP="This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info."
Date: Sat, 30 Nov 2013 14:33:46 GMT
Server: gws
Content-Length: 251
X-XSS-Protection: 1; mode=block
X-Frame-Options: SAMEORIGIN
Alternate-Protocol: 80:quic
Connection: close
<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>302 Moved</TITLE></HEAD><BODY>
I found out that https goes via port 443 but it doesn't work.
Don't know how to make the request header...........
I can save the html content file with the "URLDownloadToFile" api but that's not what i want.
There must be a way to do it with my routine.
.486
.model flat,stdcall
option casemap:none
include windows.inc
include user32.inc
includelib user32.lib
include kernel32.inc
includelib kernel32.lib
include wsock32.inc
includelib wsock32.lib
include urlmon.inc
includelib urlmon.lib
include Console.Inc
;https://www.google.com/search?q=rammstein&tbm=isch
; HTTP/1.1 GET request for the google image search page, sent verbatim
; over the socket. Each header line ends with CR,LF (13,10); the extra
; 13,10 before the 0 is the empty line that terminates the header block.
GetImages db "GET /search?q=rammstein&tbm=isch HTTP/1.1",13,10
db "User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",13,10 ; browser-like agent; google returns more data for it (see thread)
; db "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8.0.3)",13,10
db "Host: www.google.com",13,10 ; required by HTTP/1.1
db "Accept: */*",13,10
db "Connection: close",13,10 ; server closes the socket when done -> recv loop ends on 0
db 13,10,0 ; empty line = end of request header, plus string terminator
.data?
align 4
; receive buffer for the html answer (500 KB, zeroed at runtime)
Internet_Buffer db 500*1024 dup (?)
; scratch buffer for wsprintf output
szString_buffer db 128 dup (?)
.code
;-----------------------------------------------------------------------
; InternetServer - connect to an HTTP server with plain winsock, send a
; request and receive the answer into Internet_Buffer.
; In:   Port            = TCP port to connect to (e.g. 80)
;       Internet_page   = ptr to zero-terminated host name ("google.com")
;       Internet_String = ptr to zero-terminated HTTP request to send
; Out:  Internet_Buffer holds the zero-terminated server answer
; Note: raw sockets only - an https server (port 443) needs an SSL/TLS
;       layer on top of this, which this routine does not implement.
;-----------------------------------------------------------------------
InternetServer proc uses edi Port:dword,Internet_page:dword,Internet_String:dword
LOCAL Internet_Socket,Bytes_received:dword
LOCAL wsaData:WSADATA
LOCAL Socket_adres:sockaddr_in
; clear the receive buffer so the answer is always zero-terminated
    lea edi,Internet_Buffer
    xor eax,eax
    mov ecx,sizeof Internet_Buffer / 4
    cld
    rep stosd
    mov Internet_Socket,0
; start winsock 2.2 - WSAStartup returns 0 on success
    invoke WSAStartup,0202h,addr wsaData
    test eax,eax
    jz winsock_open
    ret                                 ; startup failed: nothing to clean up
winsock_open:
; create a TCP stream socket
    invoke socket,AF_INET,SOCK_STREAM,IPPROTO_TCP
    cmp eax,INVALID_SOCKET
    jne socket_ok
    jmp Sluit_socket
socket_ok:
    mov Internet_Socket,eax
; fill in the destination address: family, port in network byte order, IP
    mov Socket_adres.sin_family,AF_INET
    invoke htons,Port
    mov Socket_adres.sin_port,ax
    invoke gethostbyname,Internet_page  ; blocking DNS lookup
    test eax,eax
    jnz server_gevonden
    jmp Sluit_socket
server_gevonden:
; hostent layout: h_name(4) + h_aliases(4) + h_addrtype(2) + h_length(2)
; puts h_addr_list at offset 12
    mov eax,[eax+12]                    ; eax = h_addr_list
    mov eax,[eax]                       ; eax = h_addr_list[0]
    mov eax,[eax]                       ; eax = first IPv4 address
inet_adres_ok:
    mov Socket_adres.sin_addr,eax
; connect to the server
    invoke connect,Internet_Socket,addr Socket_adres,sizeof Socket_adres
    cmp eax,SOCKET_ERROR
    jne contact_met_host
    jmp Sluit_socket
contact_met_host:
    invoke Print,TEXT_(13,10," Connected to Google server.",13,10,13,10)
; send the request (bail out if the send failed)
    invoke lstrlen,Internet_String
    invoke send,Internet_Socket,Internet_String,eax,0
    cmp eax,SOCKET_ERROR
    je Sluit_socket
    mov Bytes_received,0
; receive loop: append up to 256 bytes per call until the server
; closes the connection or the buffer is nearly full
Lees_socket:
    lea eax,Internet_Buffer
    add eax,Bytes_received
    invoke recv,Internet_Socket,eax,256,0
    cmp eax,SOCKET_ERROR
    jne Antwoord_ok
    jmp Antwoord_klaar
Antwoord_ok:
    test eax,eax                        ; 0 = connection closed by server
    jz Antwoord_klaar
    add Bytes_received,eax
    cmp Bytes_received,sizeof Internet_Buffer-256   ; keep room for next recv
    jae Antwoord_klaar
    jmp Lees_socket
Antwoord_klaar:
; zero-terminate the answer. The loop above stops with at least 256
; spare bytes, so writing at [edx+eax] cannot overflow. The old code
; wrote at [edx+eax-1], which destroyed the last received byte and
; underflowed the buffer when zero bytes were received.
    mov eax,Bytes_received
    lea edx,Internet_Buffer
    mov byte ptr[edx+eax],0
    invoke Print,addr Internet_Buffer
    invoke wsprintf,addr szString_buffer,TEXT_(13,10,13,10,"We are done: %d bytes received....",13,10),Bytes_received
    invoke Print,addr szString_buffer
Sluit_socket:
; close the socket (if one was opened) and shut winsock down
    cmp Internet_Socket,0
    jz Sluit_socket_ok
    invoke closesocket,Internet_Socket
    mov Internet_Socket,0
Sluit_socket_ok:
    invoke WSACleanup
    ret
InternetServer endp
start:
; entry point: print a banner, then fetch the google image-search page
; over plain http (port 80). Note: as shown earlier in the thread,
; google answers this request with a "302 Found" redirect to https.
invoke Print,TEXT_("Internet server",13,10)
invoke InternetServer,80,TEXT_("google.com"),addr GetImages
; this is what i want in memory with my server code, just what URLDownloadToFile saves to disk....
; invoke URLDownloadToFile,0,TEXT_("https://www.google.com/search?q=rammstein&tbm=isch"),TEXT_("google_html.txt"),0,0
invoke Wait_Key
invoke ExitProcess,0
end start
Hello,
I think you have to manage SSL/TLS by yourself or you can use the Winsock Secure Socket Extensions. That's my idea.
do you know this http://msdn.microsoft.com/en-us/library/windows/desktop/ms740139%28v=vs.85%29.aspx (http://msdn.microsoft.com/en-us/library/windows/desktop/ms740139%28v=vs.85%29.aspx)?
c++ sample: http://msdn.microsoft.com/en-us/library/windows/desktop/bb394814%28v=vs.85%29.aspx (http://msdn.microsoft.com/en-us/library/windows/desktop/bb394814%28v=vs.85%29.aspx)
Thank you for the hint traphunter. I'm going to try the secure socket.
Hi Marinus :t
Try this code:
include \masm32\include\masm32rt.inc
include \masm32\include\wininet.inc
includelib \masm32\lib\wininet.lib
.686
.mmx
.xmm
.data
.code
;-----------------------------------------------------------------------
; start - fetch https://www.google.com/search?q=rammstein&tbm=isch with
; the WinInet API and dump the answer to the console.
; WinInet handles the SSL/TLS handshake itself (INTERNET_FLAG_SECURE),
; so no hand-written https code is needed. Each handle is now checked
; for NULL before use; the old code passed a failed (NULL) handle on to
; the next API call and to InternetCloseHandle.
;-----------------------------------------------------------------------
start proc
    LOCAL hio:DWORD                     ; session handle (InternetOpen)
    LOCAL hic:DWORD                     ; connection handle (InternetConnect)
    LOCAL hir:DWORD                     ; request handle (HttpOpenRequest)
    LOCAL tdd:DWORD                     ; bytes delivered per InternetReadFile call
    LOCAL buf[129]:BYTE                 ; 128 data bytes + terminating zero
    invoke InternetOpen,CTXT("ASM example"),0,0,0,0
    test eax,eax                        ; NULL = WinInet could not start
    jz no_session
    mov hio,eax
    invoke InternetConnect,eax,CTXT("www.google.com"),443,0,0,INTERNET_SERVICE_HTTP,0,0
    test eax,eax
    jz no_connection
    mov hic,eax
    invoke HttpOpenRequest,eax,CTXT("GET"),CTXT("/search?q=rammstein&tbm=isch"),0,0,0,INTERNET_FLAG_NO_CACHE_WRITE or INTERNET_FLAG_SECURE,0
    test eax,eax
    jz no_request
    mov hir,eax
    invoke HttpSendRequest,eax,0,0,0,0
read_more:
; read the answer in chunks of at most sizeof buf-1 bytes
    invoke InternetReadFile,hir,addr buf,sizeof buf-1,addr tdd
    test eax,eax                        ; FALSE = read error
    jz read_done
    mov eax,tdd
    test eax,eax                        ; 0 bytes = end of data
    jz read_done
    mov byte ptr [buf+eax],0            ; zero-terminate the chunk for printf
    invoke crt_printf,CTXT("%s"),addr buf
    jmp read_more
read_done:
    invoke InternetCloseHandle,hir
no_request:
    invoke InternetCloseHandle,hic
no_connection:
    invoke InternetCloseHandle,hio
no_session:
    invoke crt__getch                   ; wait for a key before exiting
    invoke crt_exit,0
start endp
end start
This code will connect to the HTTPS server, if you want to connect to HTTP server, then just change the blue from 443 (port) to 80, and remove red part. The specific of this function is that if it will be redirected from HTTP to HTTPS, it will silently go to the redirection and will get the data you need from the secure HTTP server.
I'm not very experienced with these functions, so you may find something interesting you need, I just give an idea, if it is suitable for you :t
it dumps a rather complex HTML to the console, here, Alex :t
built with no errors,
running XP SP3
Thank you for test, Dave! :biggrin:
Thank you Antariy,
I'm not very experienced with those functions too, already tried those wininet api functions
and can't get it to save the whole page, which should be +/- 480.000 bytes long.
It only saves about the first 37.000 bytes of the page and all the image urls are not included in that part.
I'm still trying the AcceptTypes and different flags for HttpOpenRequest to get the whole page.
No success so far..... but keep on trying.
If you use "Mozilla/5.0" instead of "ASM example" it saves 71.000 bytes. ??????
; MasmBasic one-liner: download the google image-search page, save it
; to disk, then wait for a keypress and exit.
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
Init
; FileRead$() fetches the URL content, FileWrite stores it to disk
FileWrite "Ramstein.html", FileRead$("http://www.google.com/search?q=rammstein&tbm=isch")
Inkey "ok?" ; wait for a key so the console stays visible
Exit
end start
68k, but it looks complete (see attachment - the page ends with "Privacy & Terms About Google", and there are many images). What's missing, and what's wrong with URLDownloadToFile?
when it dumps to my console, the text ends with </HTML>
now, you just have to parse through all that to get the URLs for each pic/page ? (another HTML) :P
Hi jj2007,
There is nothing wrong with URLDownloadToFile, but i don't want to write a file to disk and then read it back to memory.
Your Ramstein.html has no urls to the images found by google search.
Look for imgurl=http://xxxxxxxxx.jpg
But i figured it all out, and now i can load the complete html result from google to memory.
Finally i can search for images and show them in my program.
Tomorrow i'll clean up my code and post it, must go to bed now.
Here's a test, Pictures? means 0 = no images found, 1 = yeahhhhh images found
Hi Dave,
That's exactly the reason why i needed the complete content and get the image urls.
Quote from: dedndave on December 03, 2013, 01:29:05 PM
now, you just have to parse through all that to get the URLs for each pic/page ? (another HTML) :P
Apparently, that file contains only URLs to thumbnails like this one:
http://t3.gstatic.com/images?q=tbn:ANd9GcTj_a-RuWed7-ZC9ab-vmS_98FFRU6Eye1qvMSiixpVsk1g0CbSYyXiXsGv
(http://t3.gstatic.com/images?q=tbn:ANd9GcTj_a-RuWed7-ZC9ab-vmS_98FFRU6Eye1qvMSiixpVsk1g0CbSYyXiXsGv)
i think there is also a link (or script)
the problem is that the link may be encoded
and - unless you can emulate the PHP file that is on the server - you can't break the code
now - maybe it's not that difficult
but, i doubt google wants just anyone to have the power of google without sticking a google logo and marketing data collector on there :P
the guy didn't make 35 billion by being stupid
Actually, it's not that difficult to extract the image locations. I had thought Google would protect its servers*) but nope, they load just fine. You can even grab the images, see second attachment.
; MasmBasic demo: download the google image-search page, extract the
; image URLs from the html with Extract$(), print them and optionally
; store them to a text file opened in Notepad.
include \masm32\MasmBasic\MasmBasic.inc ; download (http://masm32.com/board/index.php?topic=94.0)
Init ; uses Extract$() (http://www.webalice.it/jj2006/MasmBasicQuickReference.htm#Mb1156)
FileWrite "Ramstein.html", FileRead$("http://www.google.com/search?q=rammstein&tbm=isch")
Let esi=FileRead$("Ramstein.html") ; esi -> html text in memory
Dim ImgUrl$() ; string array for the URLs found
xor ecx, ecx ; ecx = URL counter
.Repeat
; grab the text between src="http:// and " width=" ; xsIncL keeps the
; left delimiter, xsExcR drops the right one, xsLoop continues from
; the previous hit on each pass
Let edi=Extract$(esi, 'src="http://', '" width="', xsIncL or xsExcR or xsLoop)
.Break .if byte ptr [edi]=="?" ; presumably Extract$ yields "?" when no more hits - loop stops on it
Let ImgUrl$(ecx)=Mid$(edi, 6) ; drop the 5 leading chars (src=") so the URL starts at http://
PrintLine Str$(ecx), Tb$, ImgUrl$(ecx)
inc ecx
.Until ecx>99 ; safety stop after 100 URLs
Inkey Str$("\n%i images found. Store URLs to file? (y)", ecx)
.if eax=="y"
Store "MyURLs.txt", ImgUrl$() ; write URLs to disk ...
ShEx "MyURLs.txt" ; ... and open in Notepad
.endif
Exit
end start
When Notepad pops up, copy a URL and paste it in your browser...
P.S.: Second attachment allows to pick and see an individual image.
*) On a different machine, the page loads only when previously loaded manually; so it's probably from cache. Besides, file size is 480k, and the format is different :(
Finally reached my goal. :biggrin:
Getting the complete html content so i can find the images-urls by google search and loading them to memory.
Hope it works on all Windows systems....
Edit: better to read .asm file in new attachment.
Quote from: Siekmanski on December 04, 2013, 04:09:43 AMEdit: better to read .asm file in new attachment.
Hmmm... hidden payload? ;)
Output:
Internet server
Satus Code: 200 OK
CONTENT_LENGTH: 527516
http://weirdestband.files.wordpress.com/2011/11/rammstein.jpg
Saving Rammstein.jpg ....It takes a while, though... ca. 30 seconds or so.
works here, Marinus :t
XP SP3
maybe 20 seconds - didn't time it - lol
but, it's a big image
don't know where Jochen got 527516
the one i got was a little over 4 MB
QuoteHmmm... hidden payload? ;)
No, just forgot to change tabs to spaces to make the source code more readable. :biggrin:
30 seconds, that's a long long time.
I'll rewrite the code to search and load every 1024 bytes at a time, that should speed things up.
Win8 32 bit : works OK
Quote
Internet server
Satus Code: 200 OK
CONTENT_LENGTH: 526874
http://weirdestband.files.wordpress.com/2011/11/rammstein.jpg
Saving Rammstein.jpg ....
Press any key to continue...
The image is the same as in Jochen's post but the CONTENT_LENGTH DIFFERS
Thanks guys :biggrin:
4MB that's also part of the long time i guess, but you can search for smaller images if you like.
(TBM=isch)
When you search for images, TBM=isch, you can also use the following TBS values:
•Large images: tbs=isz:l
•Medium images: tbs=isz:m
•Icon sized images: tbs=isz:i
•Image size larger than 400×300: tbs=isz:lt,islt:qsvga
•Image size larger than 640×480: tbs=isz:lt,islt:vga
•Image size larger than 800×600: tbs=isz:lt,islt:svga
•Image size larger than 1024×768: tbs=isz:lt,islt:xga
•Image size larger than 1600×1200: tbs=isz:lt,islt:2mp
•Image size larger than 2272×1704: tbs=isz:lt,islt:4mp
•Image sized exactly 1000×1000: tbs=isz:ex,iszw:1000,iszh:1000
•Images in full color: tbs=ic:color
•Images in black and white: tbs=ic:gray
•Images that are red: tbs=ic:specific,isc:red [orange, yellow, green, teal, blue, purple, pink, white, gray, black, brown]
•Image type Face: tbs=itp:face
•Image type Photo: tbs=itp:photo
•Image type Clipart: tbs=itp:clipart
•Image type Line drawing: tbs=itp:lineart
•Group images by subject: tbs=isg:to
•Show image sizes in search results: tbs=imgo:1
Example URL: Search in images for "michael jackson" as a phrase, and limit results to 4 megapixel images or larger, color images, face images, and group the results by topic:
http://www.google.com/search?q=%22michael+jackson%22&tbm=isch&tbs=ic:color,isz:lt,islt:4mp,itp:face,isg:to
QuoteThe image is the same as in Jochen's post but the CONTENT_LENGTH DIFFERS
The content differs from time to time, maybe it's updated then with new info.?
well - don't know how you select which image to d/l
i didn't look at the code
but - google selects results based on location and past search history
i may get the same images as Jochen, but in a different order
try creating an HTML page from the first 100 available images
<a> tags are pretty easy
i.e., rather than downloading,
just see what's available to help understand the selection issues
Hi Dave,
CONTENT_LENGTH: is the total length of the html file from google.
Finding the urls by searching for imgurl=http: in the html file and check if it ends with .jpg
The image in my source code is the first one it finds in the html file, but there are many more in the html file.
Downloading the image was pure for checking if the found image-url works.
Next i'll code a routine that gathers all the urls and put them in a list from where i can choose one.
oh - gotcha :t
Now it finds all jpg urls and checks the length of the url ( no longer than 259 bytes + trailing 0 )
And put them in an image list with all the addresses to the url strings.
Some of the strings look like this:
http://www.supermusic.sk/obrazky/2585635_P%252520R%252520Brown%2525202011.jpg
I'll work on a routine to convert those to plain ascii text.
At the bottom of the source is a routine to save one of the images found by image number. ( remove semicolons )
edit: new attachment, added maximum of 128 images to prevent buffer overflow and removed 2 lines of unused code.
when you write the conversion routine, you might want to support something like the following
%2520
that's a tricky one, because "%25" is "%" ;)
so, "%2520" is a space - normally, you'd see it as "%20"
i have seen that in URL's, before
Ever heard of/used OpenSSL (http://slproweb.com/products/Win32OpenSSL.html)?
Url decoding routine done. :biggrin:
% == 25 hex
example:
%252520 is encoded 3 times and represent a space character ( 20 == hex 20 == 32 dec == space )
%2520 is space is encoded 2 times
%20 is 1 time encoded
Decoding routine checks for multiple % and then calculates the value that follows.
http://i1223.photobucket.com/albums/dd517/jgwicked/Rammstein%252520Dec%25252011%2525202010/Rammstein1992.jpg
decoded: http://i1223.photobucket.com/albums/dd517/jgwicked/Rammstein Dec 11 2010/Rammstein1992.jpg
found an error in line 147
cmp edx,260
jz url_to_long
change it to:
cmp edx,260
je url_to_long
:redface:
JZ and JE are the same opcode
Oooh yeahhh ::)
I really need a break it's 5 AM and i need to go to bed :biggrin:
me too - see you tomorrow, Marinus :t
Thanks guys for testing and helping out. :t
I'm done now with the search routines.
Now i can use it for searching album covers in my program.
It's also nice to search for fixed sizes, in this case images 512 by 512 pixels.
final version,
Excellent job. :t
What kind of download time is average for that file size ?
Andy
Thanks :biggrin:
It depends on the speed of your Internet connection i think.
But you can search for different image sizes.
Does the code search for rammstein.jpg or just download that file if it finds it ?
Andy
if you want to search for album cover art,
artist = Rammstein
album = Sehnsucht
the search phrase is then this one, it searches for album art images with exact sizes (512 by 512)
/search?q=rammstein sehnsucht album cover&tbm=isch&tbs=isz:ex,iszw:512,iszh:512
But you can search for any image you want,
search for panda?
/search?q=panda&tbm=isch
in the source code the save name "rammstein.jpg" is fixed and saves the first found image ( for testing only )
Hi Marinus :t You get the large HTTP answer from google when you used the UserAgent string that is used currently?
Yes. :biggrin:
But nothing mentioned by microsoft or MSDN. I was playing with it because ""Mozilla/5.0" returned 70 Kb instead of 36 Kb.
So i searched the net for useragent examples as i used it before in the winsock example.
maybe your own real user-agent works: http://www.viewmyuseragent.com/ (http://www.viewmyuseragent.com/)
Quote from: Siekmanski on December 05, 2013, 08:44:19 PM
Yes. :biggrin:
But nothing mentioned by microsoft or MSDN. I was playing with it because ""Mozilla/5.0" returned 70 Kb instead of 36 Kb.
So i searched the net for useragent examples as i used it before in the winsock example.
I think Google servers try to filter automated requests by checking UserAgent, and if it looks like not very similar to the real browser's string, it returns not full answer. Also I noticed that if the're is too many / too frequent requests from one IP, then google blocks the request and provides a captcha to verify that the request was done by people, so the program should not ask for searches too frequently - that is not looks like the people do the search. But you may use the proxies as well - specify one external proxy in there:
invoke InternetOpen,CTXT("ASM example"),INTERNET_OPEN_TYPE_PROXY,CTXT("proxyaddress:proxyport"),CTXT("<local>"),0
so your request will be routed through external proxy with its address "seeing" to google :t If one IP address gets blocked after frequent searches, you may change the proxy and continue seaches :biggrin:
Thanks Antariy, i'll keep this "proxy address" trick in mind. :t
Weird stuff. Make a fake browser or something. Fork another; idk.
Run it so you don't even have to look at the web pages anymore.
Or just script it with your creepy requests.
yes.....
Marinus is very "creepy" - lol
(those creepy Nederlanders)
that's just how he rolls 8)
we've all learned something from Marinus, though :t
:biggrin:
lol
that's just creepy :biggrin:
Hi Marinus. I know this is an old post, but did you succeed with the proxy trick ?
No, never used the proxy trick.
But did it work without google blocking ? I´m trying to use google translator, but it keeps blocking (and ask for a captcha) after a few open/close requests.
I don't know, never tested the proxy strategy.
What I do know is, google change their page structures from time to time.