News:

Masm32 SDK description, downloads and other helpful links
Message to All Guests

Main Menu

Getting html content of google search

Started by Siekmanski, December 01, 2013, 01:45:23 AM

Previous topic - Next topic

Siekmanski

Pulling hairs out of my head...   :(

I'm trying to get the html content of google search and load it in to memory.
Then i can search for the image urls and show those images in my program.

I've written a routine to connect with the google server and send a search request.
But it redirects me to the https page and I don't know how to connect to that page.


HTTP/1.1 302 Found
Location: https://www.google.com/search?q=rammstein&tbm=isch
Cache-Control: private
Content-Type: text/html; charset=UTF-8
Set-Cookie: PREF=ID=2580d3f61611f780:FF=0:TM=1385822026:LM=1385822026:S=bScjZ7XszAVzBc_T; expires=Mon, 30-Nov-2015 14:33:46 GMT; path=/; domain=.google.com
Set-Cookie: NID=67=PS3svhZFPKcHfZgV1sfXdsBY4nSYd3cGCQgn6JbVrVc-7XmghKnx5NPmqSlsJ2Ib0MZ--IhhpCuxwbdCTtC2hiOWM4GLLwdM_qrIotBmHkzRs08GZvP0sHWMs9ExZP8j; expires=Sun, 01-Jun-2014 14:33:46 GMT; path=/; domain=.google.com; HttpOnly
P3P: CP="This is not a P3P policy! See http://www.google.com/support/accounts/bin/answer.py?hl=en&answer=151657 for more info."
Date: Sat, 30 Nov 2013 14:33:46 GMT
Server: gws
Content-Length: 251
X-XSS-Protection: 1; mode=block
X-Frame-Options: SAMEORIGIN
Alternate-Protocol: 80:quic
Connection: close

<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>302 Moved</TITLE></HEAD><BODY>


I found out that https goes via port 443 but it doesn't work.
Don't know how to make the request header...........

I can save the html content file with the "URLDownloadToFile" api but that's not what i want.
There must be a way to do it with my routine.


    .486
    .model      flat,stdcall
    option      casemap:none

    include     windows.inc
    include     user32.inc
    includelib  user32.lib
    include     kernel32.inc
    includelib  kernel32.lib

    include     wsock32.inc
    includelib  wsock32.lib
    include     urlmon.inc
    includelib  urlmon.lib

    include     Console.Inc

; Target page: https://www.google.com/search?q=rammstein&tbm=isch
;
; GetImages - complete HTTP/1.1 GET request, stored as one NUL-terminated
; string. Every header line ends in CR,LF (13,10); the lone 13,10 on the
; last line is the blank line that ends the header block, and the final 0
; is the terminator that lstrlen relies on when InternetServer sends it.
; NOTE(review): no segment directive (.data/.const) is visible before this
; definition - presumably one of the include files opens one; confirm.

GetImages   db "GET /search?q=rammstein&tbm=isch HTTP/1.1",13,10
            db "User-Agent: Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko",13,10
; Alternative User-Agent, currently disabled:
; db "User-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; nl; rv:1.8.0.3)",13,10
            db "Host: www.google.com",13,10
            db "Accept: */*",13,10
            db "Connection: close",13,10
            db 13,10,0

; Uninitialised data used by InternetServer.
.data?
align 4
; Receive buffer for the server reply (500 KiB); InternetServer clears it
; with dword stores, so its size must stay a multiple of 4.
Internet_Buffer     db 500*1024 dup (?)
; Scratch buffer for the wsprintf-formatted status line.
szString_buffer     db 128 dup (?)

.code

;-----------------------------------------------------------------------
; InternetServer - send one request over a plain TCP socket and print
;                  everything the server answers.
;
; ABI:   32-bit stdcall (MASM invoke), edi preserved via USES.
; In:    Port            = TCP port in host byte order (e.g. 80)
;        Internet_page   = pointer to NUL-terminated host name
;        Internet_String = pointer to NUL-terminated request text
; Out:   eax = WSAStartup error code if Winsock could not be started,
;              otherwise the result of WSACleanup. Callers in this file
;              ignore it.
; Side effects: fills Internet_Buffer with the reply (NUL-terminated),
;        prints the reply and a byte count through Print, overwrites
;        szString_buffer.
; Note:  no TLS is spoken here; the socket carries the request bytes as
;        they are.
;-----------------------------------------------------------------------
RECV_CHUNK  equ 256                     ; maximum bytes requested per recv call

InternetServer proc uses edi Port:dword,Internet_page:dword,Internet_String:dword

LOCAL   Internet_Socket,Bytes_received:dword
LOCAL   wsaData:WSADATA
LOCAL   Socket_adres:sockaddr_in

    ; Start from an all-zero receive buffer.
    lea     edi,Internet_Buffer
    xor     eax,eax
    mov     ecx,sizeof Internet_Buffer / 4
    cld
    rep     stosd

    ; Internet_Socket == 0 means "no socket to close" in the cleanup path.
    mov     Internet_Socket,0
    invoke  WSAStartup,0202h,addr wsaData       ; request Winsock 2.2
    test    eax,eax
    jz      winsock_open
    ret                                         ; Winsock unavailable: nothing to clean up

winsock_open:
    invoke  socket,AF_INET,SOCK_STREAM,IPPROTO_TCP
    cmp     eax,INVALID_SOCKET
    je      Sluit_socket
    mov     Internet_Socket,eax

    ; Build the peer address: family, port (network order), first IPv4 address.
    mov     Socket_adres.sin_family,AF_INET
    invoke  htons,Port
    mov     Socket_adres.sin_port,ax
    invoke  gethostbyname,Internet_page
    test    eax,eax
    jz      Sluit_socket                        ; host name could not be resolved
    mov     eax,[eax+12]                        ; hostent.h_addr_list (offset 12 in the 32-bit layout)
    mov     eax,[eax]                           ; pointer to the first address entry
    mov     eax,[eax]                           ; the IPv4 address itself, network byte order
    mov     Socket_adres.sin_addr,eax
    invoke  connect,Internet_Socket,addr Socket_adres,sizeof Socket_adres
    cmp     eax,SOCKET_ERROR
    je      Sluit_socket

    invoke  Print,TEXT_(13,10," Connected to Google server.",13,10,13,10)
    invoke  lstrlen,Internet_String
    invoke  send,Internet_Socket,Internet_String,eax,0
    cmp     eax,SOCKET_ERROR
    je      Sluit_socket                        ; request never left: do not wait for a reply

    ; Receive until the peer closes, an error occurs, or the buffer is nearly full.
    ; Invariant at Lees_socket: Bytes_received < sizeof Internet_Buffer - RECV_CHUNK,
    ; so one more chunk always fits and leaves at least one byte for the terminator.
    mov     Bytes_received,0
Lees_socket:
    lea     eax,Internet_Buffer
    add     eax,Bytes_received
    invoke  recv,Internet_Socket,eax,RECV_CHUNK,0
    cmp     eax,SOCKET_ERROR
    je      Antwoord_klaar
    test    eax,eax
    jz      Antwoord_klaar                      ; 0 = connection closed by the peer
    add     Bytes_received,eax
    cmp     Bytes_received,sizeof Internet_Buffer - RECV_CHUNK
    jb      Lees_socket

Antwoord_klaar:
    ; Terminate directly after the last received byte. The index is at most
    ; sizeof Internet_Buffer - 1 (see invariant above) and is 0 when nothing
    ; arrived, so this never writes outside the buffer and never overwrites
    ; received data.
    mov     eax,Bytes_received
    lea     edx,Internet_Buffer
    mov     byte ptr [edx+eax],0
    invoke  Print,addr Internet_Buffer

    invoke  wsprintf,addr szString_buffer,TEXT_(13,10,13,10,"We are done: %d bytes received....",13,10),Bytes_received
    invoke  Print,addr szString_buffer

Sluit_socket:
    cmp     Internet_Socket,0
    jz      Sluit_socket_ok
    invoke  closesocket,Internet_Socket
    mov     Internet_Socket,0
Sluit_socket_ok:
    invoke  WSACleanup
    ret
InternetServer endp

; Program entry point: print a banner, run one request through
; InternetServer, wait for a key (Wait_Key) and exit.
start:
    invoke  Print,TEXT_("Internet server",13,10)
    ; Port 80, host "google.com", request text = GetImages.
    invoke  InternetServer,80,TEXT_("google.com"),addr GetImages

;    Goal: have in memory, via the socket code above, the same content that
;    URLDownloadToFile saves to disk.

;    invoke  URLDownloadToFile,0,TEXT_("https://www.google.com/search?q=rammstein&tbm=isch"),TEXT_("google_html.txt"),0,0

    invoke  Wait_Key
    invoke  ExitProcess,0
end start

Creative coders use backward thinking techniques as a strategy.

traphunter

Hello,

I think you have to manage SSL/TLS yourself, or you can use the Winsock Secure Socket Extensions. Those are my ideas.

do you know this http://msdn.microsoft.com/en-us/library/windows/desktop/ms740139%28v=vs.85%29.aspx?

c++ sample: http://msdn.microsoft.com/en-us/library/windows/desktop/bb394814%28v=vs.85%29.aspx

Siekmanski

Thank you for the hint traphunter. I'm going to try the secure socket.
Creative coders use backward thinking techniques as a strategy.

Antariy

Hi Marinus :t

Try this code:



include \masm32\include\masm32rt.inc
include \masm32\include\wininet.inc
includelib \masm32\lib\wininet.lib

.686
.mmx
.xmm

.data

.code

;-----------------------------------------------------------------------
; start - fetch /search?q=rammstein&tbm=isch from www.google.com on port
;         443 through WinINet and echo the response to the console.
; Return values of the open/connect/send calls are not inspected; the
; read loop simply ends on the first failed or empty read.
;-----------------------------------------------------------------------
start proc
LOCAL hSession:DWORD
LOCAL hConnect:DWORD
LOCAL hRequest:DWORD
LOCAL cbRead:DWORD
LOCAL chunk[129]:BYTE                   ; 128 data bytes + 1 for the terminator

    ; Session -> connection -> request, each handle feeding the next call.
    invoke  InternetOpen,CTXT("ASM example"),0,0,0,0
    mov     hSession,eax
    invoke  InternetConnect,eax,CTXT("www.google.com"),443,0,0,INTERNET_SERVICE_HTTP,0,0
    mov     hConnect,eax
    invoke  HttpOpenRequest,eax,CTXT("GET"),CTXT("/search?q=rammstein&tbm=isch"),0,0,0,INTERNET_FLAG_NO_CACHE_WRITE or INTERNET_FLAG_SECURE,0
    mov     hRequest,eax
    invoke  HttpSendRequest,eax,0,0,0,0

    ; Read the body in pieces of at most sizeof chunk-1 bytes and print each
    ; piece as a C string.
    .while TRUE
        invoke  InternetReadFile,hRequest,addr chunk,sizeof chunk-1,addr cbRead
        .break .if eax == 0             ; read call failed
        mov     eax,cbRead
        .break .if eax == 0             ; zero bytes read: end of data
        mov     byte ptr [chunk+eax],0  ; terminate right after the bytes just read
        invoke  crt_printf,CTXT("%s"),addr chunk
    .endw

    ; Close handles in reverse order of creation.
    invoke  InternetCloseHandle,hRequest
    invoke  InternetCloseHandle,hConnect
    invoke  InternetCloseHandle,hSession

    invoke  crt__getch
    invoke  crt_exit,0

start endp


end start


This code will connect to the HTTPS server. If you want to connect to an HTTP server instead, just change the port (443, highlighted in blue in the original post) to 80 and remove the part highlighted in red (presumably "or INTERNET_FLAG_SECURE"). A particular feature of these functions is that if the request is redirected from HTTP to HTTPS, they silently follow the redirection and get the data you need from the secure HTTP server.
I'm not very experienced with these functions, so you may find something interesting you need, I just give an idea, if it is suitable for you :t

dedndave

it dumps a rather complex HTML to the console, here, Alex   :t

built with no errors,
running XP SP3


Siekmanski

Thank you Antariy,

I'm not very experienced with those functions either. I already tried those wininet api functions
and can't get them to save the whole page, which should be +/- 480.000 bytes long.

It only saves about the first 37.000 bytes of the page and all the image urls are not included in that part.
I'm still trying the AcceptTypes and different flags for HttpOpenRequest to get the whole page.
No success so far..... but I'll keep on trying.

If you use "Mozilla/5.0" instead of "ASM example" it saves 71.000 bytes. ??????
Creative coders use backward thinking techniques as a strategy.

jj2007

; Saves the Google image-search page for "rammstein" to Ramstein.html.
; NOTE(review): Init, FileWrite, FileRead$, Inkey and Exit are MasmBasic
; macros whose definitions are not shown here; the per-line notes are
; inferred from their names and the accompanying post - confirm against
; the MasmBasic documentation.
include \masm32\MasmBasic\MasmBasic.inc        ; download
  Init
  ; FileRead$ is handed a URL; its result is written to Ramstein.html.
  FileWrite "Ramstein.html", FileRead$("http://www.google.com/search?q=rammstein&tbm=isch")
  Inkey "ok?"        ; presumably prompts and waits for a key press
  Exit
end start


68k, but it looks complete (see attachment - the page ends with "Privacy & Terms About Google", and there are many images). What's missing, and what's wrong with URLDownloadToFile?

dedndave

when it dumps to my console, the text ends with </HTML>

now, you just have to parse through all that to get the URLs for each pic/page ? (another HTML)   :P

Siekmanski

Hi jj2007,

There is nothing wrong with URLDownloadToFile, but i don't want to write a file to disk and then read it back to memory.
Your Ramstein.html has no urls to the images found by google search.

Look for imgurl=http://xxxxxxxxx.jpg

But I figured it all out, and now I can load the complete html result from google to memory.
Finally I can search for images and show them in my program.

Tomorrow I'll clean up my code and post it, must go to bed now.

Here's a test, Pictures? means 0 = no images found, 1 = yeahhhhh images found

Creative coders use backward thinking techniques as a strategy.

Siekmanski

Hi Dave,

That's exactly the reason why i needed the complete content and get the image urls.
Creative coders use backward thinking techniques as a strategy.

jj2007

Quote from: dedndave on December 03, 2013, 01:29:05 PM
now, you just have to parse through all that to get the URLs for each pic/page ? (another HTML)   :P

Apparently, that file contains only URLs to thumbnails like this one:
http://t3.gstatic.com/images?q=tbn:ANd9GcTj_a-RuWed7-ZC9ab-vmS_98FFRU6Eye1qvMSiixpVsk1g0CbSYyXiXsGv

dedndave

#12
i think there is also a link (or script)
the problem is that the link may be encoded
and - unless you can emulate the PHP file that is on the server - you can't break the code

now - maybe it's not that difficult
but, i doubt google wants just anyone to have the power of google without sticking a google logo and marketing data collector on there   :P

the guy didn't make 35 billion by being stupid

jj2007

#13
Actually, it's not that difficult to extract the image locations. I had thought Google would protect its servers*) but nope, they load just fine. You can even grab the images, see second attachment.

; Downloads the Google image-search page, pulls image URLs out of it and
; optionally stores them in MyURLs.txt.
; NOTE(review): Init, FileWrite, FileRead$, Let, Dim, Extract$, Mid$,
; PrintLine, Str$, Tb$, Inkey, Store, ShEx and Exit are MasmBasic macros
; not shown here; notes on their behaviour are assumptions to be
; confirmed against the MasmBasic documentation.
include \masm32\MasmBasic\MasmBasic.inc        ; download
  Init        ; uses Extract$()

  FileWrite "Ramstein.html", FileRead$("http://www.google.com/search?q=rammstein&tbm=isch")
  Let esi=FileRead$("Ramstein.html")        ; esi = saved page, read back from disk
  Dim ImgUrl$()        ; string array that collects the URLs
  xor ecx, ecx        ; ecx = number of URLs stored so far
  .Repeat
        ; Next span between 'src="http://' and '" width="'; the xs* flags
        ; presumably mean include-left / exclude-right / continue after the
        ; previous match - TODO confirm.
        Let edi=Extract$(esi, 'src="http://', '" width="', xsIncL or xsExcR or xsLoop)
        ; Stop when the result starts with "?" (presumably Extract$'s
        ; "no match" marker - verify).
        .Break .if byte ptr [edi]=="?"
        ; Keep the text from position 6 on (presumably dropping the leading 'src="').
        Let ImgUrl$(ecx)=Mid$(edi, 6)
        PrintLine Str$(ecx), Tb$, ImgUrl$(ecx)
        inc ecx
  .Until ecx>99        ; at most 100 entries (indices 0..99)
  Inkey Str$("\n%i images found. Store URLs to file? (y)", ecx)
  .if eax=="y"        ; assumes Inkey leaves the pressed key in eax - confirm
        Store "MyURLs.txt", ImgUrl$()        ; write URLs to disk ...
        ShEx "MyURLs.txt"        ; ... and open in Notepad
  .endif

  Exit
end start


When Notepad pops up, copy a URL and paste it in your browser...
P.S.: Second attachment allows to pick and see an individual image.

*) On a different machine, the page loads only when previously loaded manually; so it's probably from cache. Besides, file size is 480k, and the format is different :(

Siekmanski

Finally reached my goal.  :biggrin:
Getting the complete html content so i can find the images-urls by google search and loading them to memory.
Hope it works on all Windows systems....

Edit: better to read .asm file in new attachment.
Creative coders use backward thinking techniques as a strategy.