Getting html content of google search

Siekmanski · December 01, 2013, 01:45:23 AM

Pulling hairs out of my head... :(

I'm trying to get the html content of google search and load it in to memory.
Then i can search for the image urls and show those images in my program.

I've written a routine to connect with the google server and send a search request.
But it redirects me to the https page and I don't know how to connect to that page.

Code Select


HTTP/1.1 302 Found
Location: https://www.google.com/search?q=rammstein&tbm=isch
Cache-Control: private
Content-Type: text/html; charset=UTF-8
Set-Cookie: PREF=ID=2580d3f61611f780:FF=0:TM=1385822026:LM=1385822026:S=bScjZ7XszAVzBc_T; expires=Mon, 30-Nov-2015 14:33:46 GMT; path=/; domain=.google.com
Set-Cookie: NID=67=PS3svhZFPKcHfZgV1sfXdsBY4nSYd3cGCQgn6JbVrVc-7XmghKnx5NPmqSlsJ2Ib0MZ--IhhpCuxwbdCTtC2hiOWM4GLLwdM_qrIotBmHkzRs08GZvP0sHWMs9ExZP8j; expires=Sun, 01-Jun-2014 14:33:46 GMT; path=/; domain=.google.com; HttpOnly
P3P: CP="This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info."
Date: Sat, 30 Nov 2013 14:33:46 GMT
Server: gws
Content-Length: 251
X-XSS-Protection: 1; mode=block
X-Frame-Options: SAMEORIGIN
Alternate-Protocol: 80:quic
Connection: close

<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>302 Moved</TITLE></HEAD><BODY>

I found out that https goes via port 443 but it doesn't work.
Don't know how to make the request header...........

I can save the html content file with the "URLDownloadToFile" api but that's not what i want.
There must be a way to do it with my routine.

Code Select


    .486
    .model      flat,stdcall 
    option      casemap:none

    include     windows.inc
    include     user32.inc
    includelib  user32.lib
    include     kernel32.inc
    includelib  kernel32.lib

    include     wsock32.inc
    includelib  wsock32.lib
    include     urlmon.inc
    includelib  urlmon.lib

    include     Console.Inc

;https://www.google.com/search?q=rammstein&tbm=isch

; Raw HTTP/1.1 GET request for a Google image search ("tbm=isch") on "rammstein".
; Each header line ends in CR,LF (13,10); the empty CR,LF line ends the header
; block and the trailing 0 makes the request a C string (its length is taken
; with lstrlen before it is sent).
; NOTE(review): no .data directive is visible before this definition - presumably
; Console.Inc leaves a suitable segment open; confirm.
GetImages   db "GET /search?q=rammstein&tbm=isch HTTP/1.1",13,10
            db "User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",13,10
; Alternative (older) User-Agent string, currently disabled:
;			db "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8.0.3)",13,10
            db "Host: www.google.com",13,10
            db "Accept: */*",13,10
            db "Connection: close",13,10
            db 13,10,0

; Uninitialised data used by InternetServer.
.data?
align 4
; Receive buffer (500 KiB) that collects the complete server response.
Internet_Buffer     db 500*1024 dup (?)
; Scratch buffer for the wsprintf-formatted status message.
szString_buffer     db 128 dup (?)

.code

; ---------------------------------------------------------------------------
; InternetServer
;   Connects to a host over plain TCP, sends a request string and collects
;   the whole response in Internet_Buffer, then prints the response and the
;   number of bytes received.
;
;   Port            - TCP port in host byte order (converted with htons).
;   Internet_page   - pointer to a zero-terminated host name (resolved with
;                     gethostbyname).
;   Internet_String - pointer to a zero-terminated request that is sent as-is.
;
;   The return value in eax is not meaningful to callers (it is whatever the
;   last API call left there).
;
;   Changes compared with the previous version:
;     * The terminating zero is stored AFTER the last received byte instead
;       of on top of it. The old code lost the final byte of the response
;       and wrote one byte in front of Internet_Buffer when nothing at all
;       had been received.
;     * A failing send is detected instead of being ignored.
; ---------------------------------------------------------------------------
InternetServer proc uses edi Port:dword,Internet_page:dword,Internet_String:dword

LOCAL   Internet_Socket:dword,Bytes_received:dword
LOCAL   wsaData:WSADATA
LOCAL   Socket_adres:sockaddr_in

    ; Clear the complete receive buffer (its size is a multiple of 4).
    lea     edi,Internet_Buffer
    xor     eax,eax
    mov     ecx,sizeof Internet_Buffer / 4
    cld
    rep     stosd

    mov     Internet_Socket,0               ; 0 = "no socket to close"
    invoke  WSAStartup,0202h,addr wsaData   ; request Winsock 2.2
    test    eax,eax
    jz      winsock_ready
    ret                                     ; startup failed: nothing to clean up

winsock_ready:
    invoke  socket,AF_INET,SOCK_STREAM,IPPROTO_TCP
    cmp     eax,INVALID_SOCKET
    je      close_socket
    mov     Internet_Socket,eax

    ; Fill in the destination address: family, port (network byte order), IP.
    mov     Socket_adres.sin_family,AF_INET
    invoke  htons,Port
    mov     Socket_adres.sin_port,ax
    invoke  gethostbyname,Internet_page
    test    eax,eax
    jz      close_socket                    ; host name could not be resolved
    mov     eax,[eax+12]                    ; hostent.h_addr_list
    mov     eax,[eax]                       ; pointer to the first address
    mov     eax,[eax]                       ; the IPv4 address itself
    mov     Socket_adres.sin_addr,eax

    invoke  connect,Internet_Socket,addr Socket_adres,sizeof Socket_adres
    cmp     eax,SOCKET_ERROR
    je      close_socket

    invoke  Print,TEXT_(13,10," Connected to Google server.",13,10,13,10)
    invoke  lstrlen,Internet_String
    invoke  send,Internet_Socket,Internet_String,eax,0
    cmp     eax,SOCKET_ERROR
    je      close_socket                    ; request was not sent: no reply to wait for

    ; Read the response in chunks of up to 256 bytes until the peer closes
    ; the connection, an error occurs, or fewer than 256 bytes of room remain.
    mov     Bytes_received,0
read_socket:
    lea     eax,Internet_Buffer
    add     eax,Bytes_received
    invoke  recv,Internet_Socket,eax,256,0
    cmp     eax,SOCKET_ERROR
    je      response_done
    test    eax,eax
    jz      response_done                   ; 0 = connection closed by the server
    add     Bytes_received,eax
    cmp     Bytes_received,sizeof Internet_Buffer-256
    jb      read_socket

response_done:
    ; Terminate the text right behind the data. The loop above stops reading
    ; while Bytes_received is still below sizeof Internet_Buffer, so this
    ; index is always inside the buffer - also when Bytes_received is 0.
    mov     eax,Bytes_received
    lea     edx,Internet_Buffer
    mov     byte ptr [edx+eax],0
    invoke  Print,addr Internet_Buffer

    invoke  wsprintf,addr szString_buffer,TEXT_(13,10,13,10,"We are done: %d bytes received....",13,10),Bytes_received
    invoke  Print,addr szString_buffer

close_socket:
    cmp     Internet_Socket,0
    jz      socket_closed
    invoke  closesocket,Internet_Socket
    mov     Internet_Socket,0
socket_closed:
    invoke  WSACleanup
    ret
InternetServer endp

; Program entry point: print a banner, run the plain-HTTP request on port 80,
; then wait for a key press before exiting.
start:
    invoke  Print,TEXT_("Internet server",13,10)
    ; The host name passed here is only used for the DNS lookup; the Host
    ; header sent to the server is the one inside GetImages.
    invoke  InternetServer,80,TEXT_("google.com"),addr GetImages

;    This is what I want in memory with my server code: exactly what URLDownloadToFile saves to disk.

;    invoke  URLDownloadToFile,0,TEXT_("https://www.google.com/search?q=rammstein&tbm=isch"),TEXT_("google_html.txt"),0,0

    invoke  Wait_Key
    invoke  ExitProcess,0
end start

traphunter · December 01, 2013, 09:16:37 AM

Hello,

I think you have to manage SSL/TLS yourself, or you can use the Winsock Secure Socket Extensions. Those are my ideas.

do you know this http://msdn.microsoft.com/en-us/library/windows/desktop/ms740139%28v=vs.85%29.aspx?

c++ sample: http://msdn.microsoft.com/en-us/library/windows/desktop/bb394814%28v=vs.85%29.aspx

Siekmanski · December 01, 2013, 08:34:25 PM

Thank you for the hint traphunter. I'm going to try the secure socket.

Antariy · December 03, 2013, 12:05:17 AM

Hi Marinus :t

Try this code:

include \masm32\include\masm32rt.inc
include \masm32\include\wininet.inc
includelib \masm32\lib\wininet.lib

.686
.mmx
.xmm

.data

.code

; ---------------------------------------------------------------------------
; start
;   Downloads https://www.google.com/search?q=rammstein&tbm=isch through
;   WinINet and prints the response body to the console in chunks of up to
;   128 bytes, then waits for a key press and exits the process.
;
;   Every WinINet call is checked: on failure the GetLastError code is
;   printed and only the handles that were actually opened get closed
;   (previously NULL handles were passed on to the next call and to
;   InternetCloseHandle, and failures were silent).
; ---------------------------------------------------------------------------
start proc
LOCAL hio:DWORD         ; handle returned by InternetOpen
LOCAL hic:DWORD         ; handle returned by InternetConnect
LOCAL hir:DWORD         ; handle returned by HttpOpenRequest
LOCAL tdd:DWORD         ; number of bytes delivered by the last InternetReadFile
LOCAL buf[129]:BYTE     ; 128 data bytes + 1 byte for the terminating zero

; Start with all handles marked as "not opened" so cleanup knows what to close.
xor eax,eax
mov hio,eax
mov hic,eax
mov hir,eax

invoke InternetOpen,CTXT("ASM example"),0,0,0,0
test eax,eax
jz failed
mov hio,eax
invoke InternetConnect,eax,CTXT("www.google.com"),443,0,0,INTERNET_SERVICE_HTTP,0,0
test eax,eax
jz failed
mov hic,eax
invoke HttpOpenRequest,eax,CTXT("GET"),CTXT("/search?q=rammstein&tbm=isch"),0,0,0,INTERNET_FLAG_NO_CACHE_WRITE or INTERNET_FLAG_SECURE,0
test eax,eax
jz failed
mov hir,eax
invoke HttpSendRequest,eax,0,0,0,0
test eax,eax
jz failed

read_chunk:
invoke InternetReadFile,hir,addr buf,sizeof buf-1,addr tdd
test eax,eax
jz failed               ; FALSE = read error
mov eax,tdd
test eax,eax
jz cleanup              ; success with 0 bytes = end of the response
mov byte ptr [buf+eax],0 ; terminate the chunk so it can be printed with %s
invoke crt_printf,CTXT("%s"),addr buf
jmp read_chunk

failed:
; Fetch the error code first, before any other API call can overwrite it.
invoke GetLastError
invoke crt_printf,CTXT("WinINet request failed, error %u"),eax

cleanup:
; Close in reverse order of creation, skipping handles that were never opened.
.if hir != 0
invoke InternetCloseHandle,hir
.endif
.if hic != 0
invoke InternetCloseHandle,hic
.endif
.if hio != 0
invoke InternetCloseHandle,hio
.endif

invoke crt__getch
invoke crt_exit,0

start endp

end start

This code will connect to the HTTPS server. If you want to connect to an HTTP server, just change the port (shown in blue in the original post) from 443 to 80 and remove the part shown in red (presumably the "or INTERNET_FLAG_SECURE" flag). A specific feature of this function is that if the request is redirected from HTTP to HTTPS, it will silently follow the redirection and get the data you need from the secure HTTP server.
I'm not very experienced with these functions, so you may find something interesting you need, I just give an idea, if it is suitable for you :t

dedndave · December 03, 2013, 04:42:20 AM

it dumps a rather complex HTML to the console, here, Alex :t

built with no errors,
running XP SP3

Antariy · December 03, 2013, 05:58:05 AM

Thank you for test, Dave!

Siekmanski · December 03, 2013, 10:28:19 AM

Thank you Antariy,

I'm not very experienced with those functions too, already tried those wininet api functions
and can't get it to save the whole page, which should be +/- 480.000 bytes long.

It only saves about the first 37.000 bytes of the page and all the image urls are not included in that part.
I'm still trying the AcceptTypes and different flags for HttpOpenRequest to get the whole page.
No success so far... but I'll keep on trying.

If you use "Mozilla/5.0" instead of "ASM example" it saves 71.000 bytes. ??????

jj2007 · December 03, 2013, 12:45:42 PM

; MasmBasic script: fetch the Google image-search page and save it as Ramstein.html.
include \masm32\MasmBasic\MasmBasic.inc ; download
Init
; FileRead$ is given a URL here; presumably it downloads the page and FileWrite
; stores the result on disk - confirm against the MasmBasic documentation.
FileWrite "Ramstein.html", FileRead$("http://www.google.com/search?q=rammstein&tbm=isch")
Inkey "ok?" ; presumably shows the prompt and waits for a key press - confirm
Exit
end start

68k, but it looks complete (see attachment - the page ends with "Privacy & Terms About Google", and there are many images). What's missing, and what's wrong with URLDownloadToFile?

dedndave · December 03, 2013, 01:29:05 PM

when it dumps to my console, the text ends with </HTML>

now, you just have to parse through all that to get the URLs for each pic/page ? (another HTML) :P

Siekmanski · December 03, 2013, 01:37:52 PM

Hi jj2007,

There is nothing wrong with URLDownloadToFile, but i don't want to write a file to disk and then read it back to memory.
Your Ramstein.html has no urls to the images found by google search.

Look for imgurl=http://xxxxxxxxx.jpg

But i figured it all out, and now i can load the complete html result from google to memory.
Finally I can search for images and show them in my program.

Tomorrow I'll clean up my code and post it; I must go to bed now.

Here's a test, Pictures? means 0 = no images found, 1 = yeahhhhh images found

Siekmanski · December 03, 2013, 01:40:20 PM

Hi Dave,

That's exactly the reason why i needed the complete content and get the image urls.

jj2007 · December 03, 2013, 01:48:36 PM

Quote from: dedndave on December 03, 2013, 01:29:05 PM
now, you just have to parse through all that to get the URLs for each pic/page ? (another HTML) :P

Apparently, that file contains only URLs to thumbnails like this one:
http://t3.gstatic.com/images?q=tbn:ANd9GcTj_a-RuWed7-ZC9ab-vmS_98FFRU6Eye1qvMSiixpVsk1g0CbSYyXiXsGv

dedndave · December 03, 2013, 02:04:01 PM

i think there is also a link (or script)
the problem is that the link may be encoded
and - unless you can emulate the PHP file that is on the server - you can't break the code

now - maybe it's not that difficult
but, i doubt google wants just anyone to have the power of google without sticking a google logo and marketing data collector on there :P

the guy didn't make 35 billion by being stupid

jj2007 · December 03, 2013, 02:20:16 PM

Actually, it's not that difficult to extract the image locations. I had thought Google would protect its servers*) but nope, they load just fine. You can even grab the images, see second attachment.

; MasmBasic script: save the Google image-search page, pull up to 100 image
; URLs out of it, list them, and optionally store them in MyURLs.txt.
include \masm32\MasmBasic\MasmBasic.inc ; download
Init ; uses Extract$()

; Save the page to disk, then read the saved file back (pointer kept in esi).
FileWrite "Ramstein.html", FileRead$("http://www.google.com/search?q=rammstein&tbm=isch")
Let esi=FileRead$("Ramstein.html")
Dim ImgUrl$()
xor ecx, ecx ; ecx = index of the next URL / number of URLs found so far
.Repeat
; Take the text between 'src="http://' and '" width="'. The xs* flags presumably
; mean: include the left delimiter, exclude the right one, and continue after
; the previous match on each call - confirm against the MasmBasic documentation.
Let edi=Extract$(esi, 'src="http://', '" width="', xsIncL or xsExcR or xsLoop)
.Break .if byte ptr [edi]=="?" ; presumably "?" signals "no further match" - confirm
Let ImgUrl$(ecx)=Mid$(edi, 6) ; presumably skips the 5 characters of 'src="' - confirm
PrintLine Str$(ecx), Tb$, ImgUrl$(ecx)
inc ecx
.Until ecx>99 ; stop after at most 100 URLs
Inkey Str$("\n%i images found. Store URLs to file? (y)", ecx)
.if eax=="y"
Store "MyURLs.txt", ImgUrl$() ; write URLs to disk ...
ShEx "MyURLs.txt" ; ... and open in Notepad
.endif

Exit
end start

When Notepad pops up, copy a URL and paste it in your browser...
P.S.: Second attachment allows to pick and see an individual image.

*) On a different machine, the page loads only when previously loaded manually; so it's probably from cache. Besides, file size is 480k, and the format is different :(

Siekmanski · December 04, 2013, 04:09:43 AM

Finally reached my goal.

Getting the complete html content so i can find the images-urls by google search and loading them to memory.
Hope it works on all Windows systems....

Edit: better to read .asm file in new attachment.

The MASM Forum

News:

Getting html content of google search

Siekmanski

traphunter

Siekmanski

Antariy

dedndave

Antariy

Siekmanski

jj2007

dedndave

Siekmanski

Siekmanski

jj2007

dedndave

jj2007

Siekmanski