Jack Kray

  • 0

This script fetches/extracts the Title, Description, and Keywords from a webpage at a specified URL

<!--
<?php
/*
 * URL Fetch Script
 * 
 * This script fetches/extracts Title, Description, and Keywords from webpages
 * using specified URL
 * 
 * Provided by www.forkaya.com
 * 
 */


/*
 * INITIALIZATION SECTION ***************************************************************************************************************************
 */

	//request state flags; every one starts out false
	$isError = $submitted = $sourceVisible = false;

	//error messages collected while validating / fetching
	$eMsg = '';

	//captions for the source toggle link
	//(the "Visible" caption is the one used while the source is hidden, and vice versa)
	$sourceText_NotVisible = 'Hide Source Code';
	$sourceText_Visible = 'View Source Code';

	//form input plus the values extracted from the fetched page, all blank to begin with
	$aValues = array_fill_keys(
		array('url', 'title', 'description', 'keywords'),
		''
	);

	//charsets this script knows how to convert to utf-8; keep the names lowercase
	//charset custom: supporting another charset (e.g. 'windows-1258', Vietnamese) means
	//adding it here AND adding conversion logic at the other 'charset custom' comments
	$aCharsets = array(
		'utf-8',      //Unicode
		'iso-8859-1'  //Western Europe
	);

/*
 * FUNCTIONS SECTION ***************************************************************************************************************************
 */
	/**
	 * Determines which of the supported charsets a fetched page declares.
	 *
	 * The lowercased input is searched in two places, in this order:
	 *  1. the text preceding the first "<html" (the HTTP header, when the caller
	 *     included it in the response string);
	 *  2. the page's <meta http-equiv="content-type" content="..."> declaration or,
	 *     failing that, its <meta charset="..."> declaration.
	 *
	 * @param array  $aCS     Supported charset names; they must be lowercase.
	 * @param string $website Response text to inspect.
	 * @return string The first entry of $aCS that was found, or '' when none is declared.
	 */
	function get_charset($aCS,$website) {

		$result = '';
		$website = strtolower($website);

		//nothing to inspect; DOMDocument::loadHTML() also rejects an empty string
		if ($website === '') {
			return $result;
		}

		//check the http header first
		//(a position of 0 means the text starts with <html, i.e. there is no header)
		$pos = strpos($website,'<html');
		if ($pos) {
			$wsHeader = substr($website,0,$pos);
			foreach ($aCS as $val) {
				//compare against false: a match at offset 0 is still a match
				if (strpos($wsHeader,$val) !== false) {
					$result = $val;
					break;
				}
			}
		}

		if ($result !== '') {
			return $result;
		}

		//supported charset was not found in the http header; look at the meta tags

		$wsContentType = '';

		//collect parser complaints about sloppy markup internally instead of emitting warnings
		$previousSetting = libxml_use_internal_errors(true);
		$wsDOM = new DOMDocument();
		$wsDOM->loadHTML($website);
		libxml_clear_errors();
		libxml_use_internal_errors($previousSetting);

		foreach ($wsDOM->getElementsByTagName('meta') as $meta_element) {
			if (strtolower($meta_element->getAttribute('http-equiv')) == 'content-type') {
				//the last http-equiv declaration wins
				$wsContentType = strtolower($meta_element->getAttribute('content'));
			} elseif ($wsContentType === '' && $meta_element->hasAttribute('charset')) {
				//HTML5 short form: <meta charset="...">
				$wsContentType = strtolower($meta_element->getAttribute('charset'));
			}
		}

		if ($wsContentType !== '') {
			foreach ($aCS as $val) {
				if (strpos($wsContentType,$val) !== false) {
					$result = $val;
					break;
				}
			}
		}

		return $result;
	}
	
/*
 * VALIDATION AND ACTION SECTION ********************************************************************************************************************
 */
	
	//Handles a submitted form: validates the URL, fetches the page with cURL (header included),
	//converts the markup to utf-8 and reads title / description / keywords from the parsed document.
	//The extracted values are stored raw in $aValues; they must be escaped when written to the page.
	if (isset($_POST['submit'])) {

		$submitted = true;

		//a missing or non-string field (e.g. url[]=x) is treated the same as a blank one
		$postedUrl = (isset($_POST['url']) && is_string($_POST['url'])) ? trim($_POST['url']) : '';

		$aValues['url'] = $postedUrl;
		$aValues['title'] = 'No title';
		$aValues['description'] = 'No description';
		$aValues['keywords'] = 'No keywords';

		if ($postedUrl === '') {
			$eMsg .= 'URL cannot be blank.<br />';
			$isError = true;
		}

		if(!$isError) {

			$reply_page = false;
			$headerSize = 0;

			//create a new cURL resource pointing to specified url
			$cURL = curl_init($postedUrl);
			if ($cURL !== false) {
				//include the header in the output; get_charset() inspects it
				curl_setopt($cURL,CURLOPT_HEADER,true);
				//return the transfer as a string instead of outputting it directly
				curl_setopt($cURL,CURLOPT_RETURNTRANSFER,true);
				//set the request timeout in sec.
				curl_setopt($cURL,CURLOPT_TIMEOUT,60);
				//go after redirected pages
				curl_setopt($cURL,CURLOPT_FOLLOWLOCATION,true);
				//the url is user input: only allow web protocols, also after a redirect,
				//so that file://, gopher://, etc. cannot be requested
				//NOTE(review): internal http(s) hosts are still reachable; restrict them if this is public
				curl_setopt($cURL,CURLOPT_PROTOCOLS,CURLPROTO_HTTP | CURLPROTO_HTTPS);
				curl_setopt($cURL,CURLOPT_REDIR_PROTOCOLS,CURLPROTO_HTTP | CURLPROTO_HTTPS);

				//grab URL and assign it as string to variable (false on failure)
				$reply_page = curl_exec($cURL);
				if ($reply_page !== false) {
					//combined size of all received headers; used below to strip them
					$headerSize = (int) curl_getinfo($cURL,CURLINFO_HEADER_SIZE);
				}

				//close cURL resource, and free up system resources
				curl_close($cURL);
			}

			if (!is_string($reply_page) || $reply_page === '') {
				$eMsg .= 'Website unavailable.<br />';
				$isError = true;
			} else {

				//determine the website's charset (needs the header, so do it before stripping)
				$wbCharset = get_charset($aCharsets,$reply_page);

				//we do not need header anymore: keep everything from the <html tag on
				//(matched case-insensitively); without such a tag, cut at the header size
				$htmlPos = stripos($reply_page,'<html');
				if ($htmlPos !== false) {
					$reply_page = substr($reply_page,$htmlPos);
				} else {
					$reply_page = (string) substr($reply_page,$headerSize);
				}

				//an empty body has nothing to extract and DOMDocument::loadHTML() rejects it,
				//so the 'No ...' defaults set above are kept in that case
				if ($reply_page !== '') {

					//we need to convert to utf-8 because DOMDocument expects it
					switch ($wbCharset) {
						case '':
							//unknown charset: do nothing
							break;

						case 'utf-8':
							//replace stray 'iso-8859-1' declarations (if there are any) in the whole website
							$reply_page = str_ireplace('iso-8859-1','utf-8',$reply_page);
							break;

						case 'iso-8859-1':
							//relabel the declaration ...
							$reply_page = str_ireplace('iso-8859-1','utf-8',$reply_page);

							//... and encode the website into utf-8
							//utf8_encode() is deprecated as of PHP 8.2, so prefer mbstring when it is loaded
							if (function_exists('mb_convert_encoding')) {
								$reply_page = mb_convert_encoding($reply_page,'UTF-8','ISO-8859-1');
							} else {
								$reply_page = utf8_encode($reply_page);
							}
							break;

						//charset custom: add logic for other charsets as needed
						//case 'windows-1258': //Vietnamese
						//	$reply_page = str_ireplace('windows-1258','utf-8',$reply_page);
						//	write or find a code to encode the charset to utf-8
						//	break;
					}

					//add <meta http-equiv=Content-Type content="text/html; charset=utf-8">
					//right after <head> tag so that DOMDocument reads the markup as utf-8
					$reply_page = str_ireplace(
						'<head>',
						'<head><meta http-equiv=Content-Type content="text/html; charset=utf-8">',
						$reply_page);

					//collect parser complaints about sloppy markup internally instead of emitting warnings
					$previousSetting = libxml_use_internal_errors(true);
					$pageDOM = new DOMDocument();
					$pageDOM->loadHTML($reply_page);
					libxml_clear_errors();
					libxml_use_internal_errors($previousSetting);

					//Title: the first <title> element, if any
					$title_elements = $pageDOM->getElementsByTagName('title');
					if ($title_elements->length > 0) {
						$aValues['title'] = $title_elements->item(0)->nodeValue;
					}

					//Description and keywords: when a name occurs more than once, the last one wins
					foreach ($pageDOM->getElementsByTagName('meta') as $meta_element) {
						$metaName = strtolower($meta_element->getAttribute('name'));
						if ($metaName == 'description') {
							$aValues['description'] = $meta_element->getAttribute('content');
						}
						if ($metaName == 'keywords') {
							$aValues['keywords'] = $meta_element->getAttribute('content');
						}
					}
				}
			}
		}

	}
	
	//Source toggle: ?source=1 shows this script's source code, anything else hides it.
	//$sourceValue is the value the toggle link will send next; $sourceText is the link caption.
	$sourceStr = '';
	$sourceText = $sourceText_Visible;
	$sourceValue = 1;

	if (isset($_GET['source']) && $_GET['source'] === '1') {
		$sourcePath = 'url-fetch-source.php';
		$rawSource = is_readable($sourcePath) ? file_get_contents($sourcePath) : false;
		if ($rawSource !== false) {
			//the text is placed inside a <textarea>, so it must be escaped: otherwise the
			//markup contained in the source itself would be interpreted by the browser
			$sourceStr = htmlspecialchars($rawSource, ENT_QUOTES, 'UTF-8');
		}
		$sourceVisible = true;
		$sourceText = $sourceText_NotVisible;
		$sourceValue = 0;
	}

	header('Content-Type: text/html; charset=utf-8');
	
/*
 * DISPLAY SECTION **********************************************************************************************************************************
 */
?>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
	<head>
		<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
		<meta name="description" content="This PHP script extracts Title, Description, and Keywords from specified URL"/>
		<meta name="keywords" content="php scripting, php, extract, fetch, meta"/>
		<meta name="author" content="forkaya" />
		<link rel="stylesheet" href="../style.css" type="text/css">
		<title>Forkaya - PHP Scripts - URL Fetch - Extract Title, Description, and Keywords from URL</title>
	</head>
	<body>
		<form action="" method="post">
			<table 	align="left" class="bb">
				<tr>
					<td colspan="2" height="30" align="center"  class="aa"><h3><a href="..">Forkaya</a> - <a href=".">PHP Scripts</a> - <a href="./url-fetch.php">URL Fetch</a></h3></td>
				</tr>
				<tr>
					<td colspan="2" height="30" align="left">This script fetches/extracts Title, Description, and Keywords from webpages using specified URL</td>
				</tr>
				<tr>
					<td colspan="2" height="30"></td>
				</tr>
<?php

	//when validation or fetching failed, show the collected messages in their own table row
	//($eMsg is built from fixed strings that deliberately contain <br /> markup)
	if ($isError) {
		printf('
				<tr>
					<td colspan="2" align="left" class="cc">%s</td>
				</tr>
		', $eMsg);
	}

?>
				<tr>
					<td align="left">Enter URL:</td>
					<td align="left"><input type="text" name="url" maxlength="256" size="56" value="<?php /* user-supplied value: escape it for the attribute context */ echo(htmlspecialchars((string) $aValues['url'], ENT_QUOTES, 'UTF-8'));?>"/></td>
				</tr>
				<tr>
					<td align="left"><input type="submit" name="submit" value="Submit"/></td>
					<td align="left"><a href="url-fetch.php?source=<?php echo($sourceValue);?>" class="ff"><?php echo($sourceText); ?></a></td>
				</tr>
				<tr>
					<td colspan="2" align="left"></td>
				</tr>
<?php

	//After a successful submit, show the extracted values.
	//They come from an arbitrary remote page, so each one is HTML-escaped before output.
	if ($submitted && !$isError) {
		echo('
				<tr>
					<td align="left" valign="top" class="aa">Title: </td>
					<td align="left">'.htmlspecialchars((string) $aValues['title'], ENT_QUOTES, 'UTF-8').'</td>
				</tr>
				<tr>
					<td align="left" valign="top" class="aa">Description:</td>
					<td>'.htmlspecialchars((string) $aValues['description'], ENT_QUOTES, 'UTF-8').'</td>
				</tr>
				<tr>
					<td align="left" valign="top" class="aa">Keywords:</td>
					<td align="left">'.htmlspecialchars((string) $aValues['keywords'], ENT_QUOTES, 'UTF-8').'</td>
				</tr>
		'); 
	}

	if($sourceVisible) { 
		echo('
				<tr>
					<td align="left" colspan="2"><textarea rows="174" cols="210" readonly="readonly" class="ee">'.$sourceStr.'

 

loading...

Related posts

Related posts