#1
  1. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Aug 2013
    Posts
    4
    Rep Power
    0

    Web scraper: cURL ran out of memory


    Hi guys,

    I'm trying to make a web scraper and I'm having a huge problem when retrieving a huge amount of data. I have tried to increase the memory limit through php.ini, but it still doesn't solve the problem.

    The web scraper I want to make retrieves data from a journal database and puts it into an Excel file. While it works with small datasets, it runs out of memory when retrieving large datasets.

    Here is the code :

    PHP Code:
    /**
     * Fetches one result page with cURL, extracts article titles with phpQuery,
     * writes them into IEEE_Scrap.xlsx via PHPExcel, then calls itself for the next page.
     *
     * @param string $url                 Overwritten inside the function before the request is made.
     * @param mixed  $search              Not read here; only forwarded to the recursive call.
     * @param int    $currentpagenumber   Page being fetched; nothing is done once it reaches $numpage.
     * @param int    $numpage             Upper bound (exclusive) on the page number.
     * @param int    $numrecordtotalsofar Running record count; used as the first spreadsheet row after page 1.
     *
     * @return string|null Raw HTML of this page, or null when $currentpagenumber >= $numpage.
     *                     The script dies if curl_exec() returns a falsy value.
     */
    function fetchRawData($url,$search,$currentpagenumber,$numpage,$numrecordtotalsofar) {
        if($currentpagenumber<$numpage) {
            //initialise curl
            $url = "this is where i put the url with the search term and page number";

            $ch = curl_init();
            curl_setopt($ch,CURLOPT_URL,$url);
            curl_setopt($ch,CURLOPT_RETURNTRANSFER,true);
            curl_setopt($ch,CURLOPT_FAILONERROR,true);
            curl_setopt($ch,CURLOPT_FOLLOWLOCATION,true);
        //    curl_setopt($ch,CURLOPT_TIMEOUT,50000);
            // 156 and 155 are the numeric values of CURLOPT_CONNECTTIMEOUT_MS and CURLOPT_TIMEOUT_MS
            curl_setopt($ch,156,500000000);
            curl_setopt($ch,155,500000000);
            curl_setopt($ch,CURLOPT_SSL_VERIFYPEER,false);
            curl_setopt($ch,CURLOPT_SSL_VERIFYHOST,false);

            $data=curl_exec($ch);
            if(!$data) {
                echo "<br />curl error: number :".curl_errno($ch)." and the error msg is : ".curl_error($ch)."<br />\n";
                echo "detailed info :";
                var_dump(curl_getinfo($ch));
                die();
            }

            //parsing data
            $parsedData = array();
            phpQuery::newDocumentHTML($data);
            $arrtitle = array();
            // initialised explicitly: both were previously read before any assignment
            $gettitle=0;
            $countrecord=0;

            $posttitle=1;
            if($currentpagenumber<2) {
                $numrecordsofar=1;
            } else
            {
                $numrecordsofar=$numrecordtotalsofar;
            }

            //get the title, author and year of publication
            foreach(pq("a") as $link) {
                $title = pq($link)->text();
                if($title) {
                    //use regular expression to get the relevant information
                    if (preg_match("*articleDetails.jsp*", pq($link)->attr('href'))&&$gettitle<1) {
                        if(!(preg_match("*View full abstract*", $title)))
                        {
                            $dummyvar=$numrecordsofar+$posttitle;
                            array_push($arrtitle,$title);
                            $countrecord++;
                            // once set, no further links on this page are collected
                            $gettitle=1;
                        }
                    }
                }
            }

            //get the number of data
            foreach(pq("span") as $link) {
                $title = pq($link)->text();
                if($title) {
                    if (preg_match("*display-status results-returned*", pq($link)->attr('class'))) {
                        $countnumberonly = preg_replace("*Results returned*", "", $title);
                        $totalpageint = intval($countnumberonly);
                        //calculate how many pages needed and record the current page
                        $totalpageint = intval($totalpageint / 100)+2;
                    }
                }
            }

            //initialise write to excel
            $objPHPExcel = new PHPExcel();
            $objPHPExcel->getProperties()->setCreator("Maarten Balliauw")
                                         ->setLastModifiedBy("Maarten Balliauw")
                                         ->setTitle("PHPExcel Test Document")
                                         ->setSubject("PHPExcel Test Document")
                                         ->setDescription("Test document for PHPExcel, generated using PHP classes.")
                                         ->setKeywords("office PHPExcel php")
                                         ->setCategory("Test result file");

            // Set active sheet index to the first sheet, so Excel opens this as the first sheet
            $objPHPExcel = PHPExcel_IOFactory::load("IEEE_Scrap.xlsx");
            $objPHPExcel->setActiveSheetIndex(0);
            $objPHPExcel->createSheet();
            $row = $objPHPExcel->getActiveSheet()->getHighestRow()+1;

            //get data from arrays
            for($j=0;$j<count($arrtitle);$j++) {
                if(isset($arrtitle[$j])) {
                    $dummyvar=$numrecordsofar+$j;
                    $objPHPExcel->getActiveSheet()->SetCellValue('A'.$dummyvar,$arrtitle[$j]);
                } else {
                    $dummyvar=$numrecordsofar+$j;
                    $globalIEEE[$tempcount+$j][0]="No Data";
                    $objPHPExcel->getActiveSheet()->SetCellValue('A'.$dummyvar,"No Data");
                }
            }
            $objWriter = new PHPExcel_Writer_Excel2007($objPHPExcel);
            $objWriter->save('IEEE_Scrap.xlsx');

            //close curl and phpexcel
            curl_close($ch);
            unset($ch);
            unset($objPHPExcel);
            unset($objWriter);
            $currentpagenumber++;
            $numrecordtotalsofar=$numrecordtotalsofar+$countrecord;
            set_time_limit(0);
            sleep(5);

            $rawHTML = fetchRawData($url,$search,$currentpagenumber,$totalpageint,$numrecordtotalsofar);
            return $data;
        }
    }

    The logic is: first I retrieve the data on a page and parse it into an array, then initialise PHPExcel to write the data from the array into Excel, then unset cURL and PHPExcel, and then move on to the next page.

    Sorry the code is a bit messy, as I have tried so many modifications but still can't get it to work. Please help me!
  2. #2
  3. No Profile Picture
    Contributing User
    Devshed Expert (3500 - 3999 posts)

    Join Date
    Jul 2003
    Posts
    3,542
    Rep Power
    595
    Why not write directly to a file?
    PHP Code:
    // Download $url with cURL, writing the body into a local temp file rather than a PHP string.
    $fp = fopen (dirname(__FILE__) . '/localfile.tmp', 'w+');
    $ch = curl_init(str_replace(" ","%20",$url));
    curl_setopt($ch, CURLOPT_TIMEOUT, 50);
    curl_setopt($ch, CURLOPT_FILE, $fp); // This streams the response to a file
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_exec($ch);
    curl_close($ch);
    fclose($fp);
    There are 10 kinds of people in the world. Those that understand binary and those that don't.
  4. #3
  5. Sarcky
    Devshed Supreme Being (6500+ posts)

    Join Date
    Oct 2006
    Location
    Pennsylvania, USA
    Posts
    10,908
    Rep Power
    6352
    Close your curl connection as soon as you're done with it, unset $data as soon as you're done with it, etc. Or just increase the memory limit.

    You could also dump right to a file as suggested, and even dump right to a CSV (which Excel will read)
    HEY! YOU! Read the New User Guide and Forum Rules

    "They that can give up essential liberty to obtain a little temporary safety deserve neither liberty nor safety." -Benjamin Franklin

    "The greatest tragedy of this changing society is that people who never knew what it was like before will simply assume that this is the way things are supposed to be." -2600 Magazine, Fall 2002

    Think we're being rude? Maybe you asked a bad question or you're a Help Vampire. Trying to argue intelligently? Please read this.
  6. #4
  7. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Aug 2013
    Posts
    4
    Rep Power
    0
    Originally Posted by gw1500se
    Why not write directly to a file?
    PHP Code:
    // Download $url with cURL, writing the body into a local temp file rather than a PHP string.
    $fp = fopen (dirname(__FILE__) . '/localfile.tmp', 'w+');
    $ch = curl_init(str_replace(" ","%20",$url));
    curl_setopt($ch, CURLOPT_TIMEOUT, 50);
    curl_setopt($ch, CURLOPT_FILE, $fp); // This streams the response to a file
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_exec($ch);
    curl_close($ch);
    fclose($fp);
    Hi, thanks for the reply. I think I would need to parse the data before saving it to the Excel file, so dumping it straight to a file may not be suitable?

    Close your curl connection as soon as you're done with it, unset $data as soon as you're done with it, etc. Or just increase the memory limit.

    You could also dump right to a file as suggested, and even dump right to a CSV (which Excel will read)
    I did unset curl and phpexcel at the beginning of the function, and I also put curl_close($ch) before moving on to the next page. I also increased the memory limit in php.ini to 1024M. Still an error :(
  8. #5
  9. No Profile Picture
    Contributing User
    Devshed Expert (3500 - 3999 posts)

    Join Date
    Jul 2003
    Posts
    3,542
    Rep Power
    595
    When you stream to a file, you can then process that data line by line and avoid the memory limit problem.
    There are 10 kinds of people in the world. Those that understand binary and those that don't.
  10. #6
  11. Sarcky
    Devshed Supreme Being (6500+ posts)

    Join Date
    Oct 2006
    Location
    Pennsylvania, USA
    Posts
    10,908
    Rep Power
    6352
    I did unset curl and phpexcel at the beginning of the function
    The unsets are 6 lines from the END of the function. I said as soon as you're done with them. The instant you no longer need these large variables, destroy them.

    You never unset $data, which is the largest variable in this script. In fact, you only use $data ONCE, on this line:
    PHP Code:
            phpQuery::newDocumentHTML($data); 
    Since phpQuery doesn't operate on a reference, this line does nothing but increase your memory footprint.

    You later return $data, unprocessed, for no reason, again creating a very large memory use for no reason.

    This function also calls itself recursively, meaning its memory footprint grows geometrically with N (number of pages in the results). If at all possible, remove the recursion and use a loop outside this function to call it over and over, ensuring that the memory space for each function remains self-contained. Use the function's return value (which you're not using at all) to track the page number and anything else your loop needs.
    HEY! YOU! Read the New User Guide and Forum Rules

    "They that can give up essential liberty to obtain a little temporary safety deserve neither liberty nor safety." -Benjamin Franklin

    "The greatest tragedy of this changing society is that people who never knew what it was like before will simply assume that this is the way things are supposed to be." -2600 Magazine, Fall 2002

    Think we're being rude? Maybe you asked a bad question or you're a Help Vampire. Trying to argue intelligently? Please read this.
  12. #7
  13. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Aug 2013
    Posts
    4
    Rep Power
    0
    Hi ManiacDan,

    Thank you so much for your help. I'm modifying it now. In the meantime, is there any script to track memory usage during execution?
  14. #8
  15. Sarcky
    Devshed Supreme Being (6500+ posts)

    Join Date
    Oct 2006
    Location
    Pennsylvania, USA
    Posts
    10,908
    Rep Power
    6352
    memory_get_usage() returns your current memory usage. You have to track it yourself by using that to echo debug output.
    HEY! YOU! Read the New User Guide and Forum Rules

    "They that can give up essential liberty to obtain a little temporary safety deserve neither liberty nor safety." -Benjamin Franklin

    "The greatest tragedy of this changing society is that people who never knew what it was like before will simply assume that this is the way things are supposed to be." -2600 Magazine, Fall 2002

    Think we're being rude? Maybe you asked a bad question or you're a Help Vampire. Trying to argue intelligently? Please read this.
  16. #9
  17. No Profile Picture
    Registered User
    Devshed Newbie (0 - 499 posts)

    Join Date
    Aug 2013
    Posts
    4
    Rep Power
    0
    Thumbs up !! thank you, maniacdan !!

IMN logo majestic logo threadwatch logo seochat tools logo