Monday, December 12, 2011

Parse blog to find keywords

We are using PHP to find relevant blogs based on keywords.

// Bundle everything parse_blog_data() reads: the campaign identity, the two
// keyword lists to search for, and the fetched blog page itself.
$blog_data = array(
    // Campaign the page is being checked against
    'campaign_id'        => $campaign['campaign_id'],
    'campaign_name'      => $campaign['campaign_name'],

    // Keyword lists; parse_blog_data() merges them into a single list
    'mandatory_keywords' => $mandatory_keywords,
    'extra_keywords'     => $extra_keywords,

    // The blog page to scan
    'page_url'           => $get_blog['page_url'],
    'page_id'            => $get_blog['id'],
    'html'               => $get_blog['page_text'],
);

// Null when there are no keywords, otherwise an array with 'status', 'text' and 'images'
$blog_parse_result = parse_blog_data($blog_data);









/**
 * Collect the paragraphs and headings of a blog page that mention a keyword.
 *
 * Every <p>, <h1>, <h2> and <h3> element of $data['html'] is checked against
 * the merged mandatory + extra keyword list. The tag-stripped text of each
 * element containing at least one keyword is appended to the result
 * (elements separated by a blank line), and the images that are direct
 * children of that element are collected.
 *
 * The keyword comparison is a case-sensitive substring match.
 *
 * @param array $data Reads the keys 'page_url', 'mandatory_keywords' (array),
 *                    'extra_keywords' (array) and 'html' (page markup).
 *
 * @return array|null Null when the merged keyword list is empty. Otherwise an
 *                    array with:
 *                    - 'status': "OK" if any text matched, else "NOT_OK"
 *                    - 'text':   the matched text, passed through utf8_decode()
 *                    - 'images': list of image src values that getimagesize()
 *                                accepted
 */
function parse_blog_data($data)
{
    show( "Parse text for: ".$data['page_url']);

    $keywords = array_merge($data['mandatory_keywords'], $data['extra_keywords']);

    if (empty($keywords)) {
        // Nothing to search for. Callers get null here rather than a result array.
        return;
    }

    require_once("blog_parser/phpQuery.php");

    // The returned document is not used directly below; pq() is called without
    // an explicit context and presumably queries the document created here.
    $doc = phpQuery::newDocumentHTML($data['html']);

    $text = "";
    $images = array();

    foreach (pq('p,h1,h2,h3') as $element) {
        $element_html = pq($element)->html();

        if (empty($element_html)) {
            continue;
        }

        // Strip the markup once per element instead of once per keyword.
        $plain_text = strip_tags($element_html);

        foreach ($keywords as $key) {
            if (empty($key)) {
                continue;
            }

            // strpos() returns 0 for a match at the very start of the text, so
            // the result must be compared against false explicitly; a plain
            // truthiness test silently drops elements that begin with a keyword.
            if (strpos($plain_text, $key) === false) {
                continue;
            }

            // Matched elements are separated by a blank line.
            $text .= ($text !== "") ? "\n\n".$plain_text : $plain_text;

            // Collect images that are direct children of the matched element.
            foreach (pq($element)->children("img") as $img) {
                $src = pq($img)->attr("src");

                // Keep only sources getimagesize() can read as an image. Errors
                // are suppressed so unreadable sources are simply skipped.
                // NOTE(review): for remote URLs this presumably fetches the
                // image; confirm that is acceptable for untrusted pages.
                if (!empty($src) && @GetImageSize($src)) {
                    $images[] = $src;
                }
            }

            // One matching keyword is enough; move on to the next element.
            break;
        }
    }

    return array(
        'status' => (!empty($text) ? "OK" : "NOT_OK"),
        // NOTE(review): utf8_decode() is deprecated as of PHP 8.2; revisit when upgrading.
        'text' => utf8_decode($text),
        'images' => $images
    );
}

No comments:

Post a Comment