प्रकाशन तिथि: 21.10.2023

खोज इंजनों को नए पृष्ठों के बारे में सूचित करने और साइट की वर्तमान संरचना बताने के लिए आपको एक साइटमैप, यानी sitemap.xml फ़ाइल, बनानी होगी जिसमें साइट के सभी आंतरिक लिंक हों। इसे बनाने के कई तरीके हैं; आप बाहरी सेवाओं या अपने सीएमएस के अतिरिक्त प्लगइन्स का उपयोग कर सकते हैं। प्रत्येक विधि के अपने फायदे और नुकसान हैं।

इस उदाहरण में, हम एक अलग PHP स्क्रिप्ट का उपयोग करके साइटमैप फ़ाइल बनाने पर विचार करेंगे। हमें यह स्क्रिप्ट इंटरनेट पर मिली और हमने फ़ाइल हेडर में इसके लेखक के लिंक छोड़ दिए। आप उसके पेज पर जा सकते हैं और वहां से इसे डाउनलोड कर सकते हैं। यहां हम इसे फाइलों के रूप में प्रस्तुत करते हैं क्योंकि हमने इसे मामूली संपादन के अधीन किया है ताकि परिणाम हमारी अपेक्षाओं के अनुरूप हो।

1. एक स्क्रिप्ट फ़ोल्डर बनाएँ

mkdir sitemap-gen

1.1 आइए निम्नलिखित नामों से 3 फ़ाइलें बनाएं

cd sitemap-gen
touch sitemap.php
touch sitemap.functions.php
touch sitemap.config.php
sitemap.php — इस फ़ाइल में निम्नलिखित सामग्री रखें (हर PHP फ़ाइल की पहली पंक्ति `<?php` होनी चाहिए):

|**EDIT sitemap.config.php**|


// Entry point of the sitemap generator.
// Loads the configuration and helper functions, applies command-line
// overrides, crawls the site starting at its root and writes the resulting
// sitemap to $file.

// Read global variables from config file
require_once('sitemap.config.php');

// Include all functions (logger(), domain_root(), scan_url(), ...)
require_once('sitemap.functions.php');

// Default html header makes browsers ignore \n
header("Content-Type: text/plain");

$color = false;

$version_script = 2;

// Each of the three files carries its own version number; a mixed set is
// reported and the run is aborted instead of crawling with mismatched code.
if ($version_script != $version_functions || $version_functions != $version_config) {
    logger("Script versions mismatch!", 3);
    logger("Update necessary", 3);
    logger("Version of sitemap.functions.php " . $version_functions, 3);
    logger("Version of sitemap.config.php " . $version_config, 3);
    logger("Version of sitemap.php " . $version_script, 3);
    logger("Download new files here:", 3);
    exit(1);
}

// Add PHP CLI support: arguments are given as key=value pairs,
// e.g. `php sitemap.php file=/path/sitemap.xml site=https://domain.tld`.
$args = array();
if (php_sapi_name() === 'cli' && PHP_OS != 'WINNT') {
    parse_str(implode('&', array_slice($argv, 1)), $args);
    $color = true;
}

// Allow variable overloading with CLI: every listed key, when present,
// replaces the configuration variable of the same name.
$overridable = array(
    'file', 'site', 'max_depth', 'enable_frequency', 'enable_priority',
    'enable_modified', 'freq', 'priority', 'blacklist', 'debug',
);
foreach ($overridable as $name) {
    if (isset($args[$name])) {
        $$name = $args[$name];
    }
}
if (isset($args['ignore_arguments'])) {
    $ignore_arguments = !!$args['ignore_arguments'];
}
// The configuration variable read by get_data() is $index_pdf; the historic
// CLI key `pdf_index` is still accepted and mapped onto it.
if (isset($args['index_pdf'])) {
    $index_pdf = $args['index_pdf'];
} elseif (isset($args['pdf_index'])) {
    $index_pdf = $args['pdf_index'];
}

// Begin stopwatch for statistics
$start = microtime(true);

// Setup file stream: the sitemap is built in a temporary file and only moved
// to its final name once the crawl has finished.
$tempfile = tempnam(sys_get_temp_dir(), 'sitemap.xml.');
$file_stream = fopen($tempfile, "w") or die("Error: Could not create temporary file $tempfile" . "\n");

fwrite($file_stream, $xmlheader . "\n");

// Global variables, non-user defined (shared with the helper functions)
$depth = 0;
$indexed = 0;
$scanned = array();
$deferredLinks = array();

// Reduce the configured site to its domain root
$real_site = domain_root($site);

if ($real_site != $site) {
    logger("Reformatted site from $site to $real_site", 2);
}

// Begin by crawling the original url
scan_url($real_site);

// Finalize sitemap and flush it to disk before any external tool reads it
fwrite($file_stream, "</urlset>\n");
fclose($file_stream);

// Pretty-print sitemap when xmllint is available on the PATH
if ((PHP_OS == 'WINNT') ? `where xmllint` : `which xmllint`) {
    logger("Found xmllint, pretty-printing sitemap", 0);
    $quoted_tempfile = escapeshellarg($tempfile);
    $responsevalue = exec('xmllint --format ' . $quoted_tempfile . ' -o ' . $quoted_tempfile . ' 2>&1', $discardedoutputvalue, $returnvalue);
    if ($returnvalue) {
        die("Error: " . $responsevalue . "\n");
    }
}

// Generate and print out statistics
$time_elapsed_secs = round(microtime(true) - $start, 2);
logger("Sitemap has been generated in " . $time_elapsed_secs . " second" . ($time_elapsed_secs == 1 ? '' : 's') . " and saved to $file", 0);
$size = count($scanned);
logger("Scanned a total of $size pages and indexed $indexed pages.", 0);

// Rename partial file to the real file name. `rename()` overwrites any existing files
if (!rename($tempfile, $file)) {
    die("Error: Could not move $tempfile to $file" . "\n");
}

// Apply permissions
chmod($file, $permissions);

// Declare that the script has finished executing and exit
logger("Operation Completed", 0);

sitemap.functions.php — इस फ़ाइल में निम्नलिखित सामग्री रखें (हर PHP फ़ाइल की पहली पंक्ति `<?php` होनी चाहिए):

/**
 * Print one formatted log line to standard output.
 *
 * Message types:
 *   0 - page added      (printed only when $debug["add"] is truthy)
 *   1 - URL rejected    (printed only when $debug["reject"] is truthy)
 *   2 - warning / trace (printed only when $debug["warn"] is truthy)
 *   3 - critical notice (always printed)
 * Any other type prints nothing.
 *
 * When the global $color is truthy the line is wrapped in ANSI colour codes.
 *
 * @param string $message Text to print.
 * @param int    $type    One of the message types listed above.
 * @return void
 */
function logger($message, $type)
{
    global $debug, $color;

    switch ($type) {
        case 0:
            $enabled = !empty($debug["add"]);
            $prefix = "[+]";
            $ansi = "\033[0;32m";
            break;
        case 1:
            $enabled = !empty($debug["reject"]);
            $prefix = "[-]";
            $ansi = "\033[0;31m";
            break;
        case 2:
            $enabled = !empty($debug["warn"]);
            $prefix = "[!]";
            $ansi = "\033[1;33m";
            break;
        case 3:
            $enabled = true;
            $prefix = "[!]";
            $ansi = "\033[1;33m";
            break;
        default:
            return;
    }

    if (!$enabled) {
        return;
    }

    if ($color) {
        echo "$ansi $prefix $message \033[0m\n";
    } else {
        echo "$prefix $message\n";
    }
}

/**
 * Normalise a URL of the crawled site by removing dot segments from the part
 * that follows the site root.
 *
 * @param string $url Absolute URL that contains the global $real_site.
 * @return string $real_site followed by the normalised remainder.
 */
function flatten_url($url)
{
    global $real_site;

    // Split on the FIRST occurrence only: without the limit, a query string
    // that repeats the site root would cut the URL short.
    $parts = explode($real_site, $url, 2);
    $path = isset($parts[1]) ? $parts[1] : '';

    return $real_site . remove_dot_seg($path);
}

 * Remove dot segments from a URI path according to RFC3986 Section 5.2.4
 * @param $path
 * @return string
 * @link
function remove_dot_seg($path)
    if (strpos($path, '.') === false) {
        return $path;

    $inputBuffer = $path;
    $outputStack = [];

     * 2.  While the input buffer is not empty, loop as follows:
    while ($inputBuffer != '') {
         * A.  If the input buffer begins with a prefix of "../" or "./",
         *     then remove that prefix from the input buffer; otherwise,
        if (strpos($inputBuffer, "./") === 0) {
            $inputBuffer = substr($inputBuffer, 2);
        if (strpos($inputBuffer, "../") === 0) {
            $inputBuffer = substr($inputBuffer, 3);

         * B.  if the input buffer begins with a prefix of "/./" or "/.",
         *     where "." is a complete path segment, then replace that
         *     prefix with "/" in the input buffer; otherwise,
        if ($inputBuffer === "/.") {
            $outputStack[] = '/';
        if (substr($inputBuffer, 0, 3) === "/./") {
            $inputBuffer = substr($inputBuffer, 2);

         * C.  if the input buffer begins with a prefix of "/../" or "/..",
         *     where ".." is a complete path segment, then replace that
         *     prefix with "/" in the input buffer and remove the last
         *     segment and its preceding "/" (if any) from the output
         *     buffer; otherwise,
        if ($inputBuffer === "/..") {
            $outputStack[] = '/';
        if (substr($inputBuffer, 0, 4) === "/../") {
            $inputBuffer = substr($inputBuffer, 3);

         * D.  if the input buffer consists only of "." or "..", then remove
         *     that from the input buffer; otherwise,
        if ($inputBuffer === '.' || $inputBuffer === '..') {

         * E.  move the first path segment in the input buffer to the end of
         *     the output buffer, including the initial "/" character (if
         *     any) and any subsequent characters up to, but not including,
         *     the next "/" character or the end of the input buffer.
        if (($slashPos = stripos($inputBuffer, '/', 1)) === false) {
            $outputStack[] = $inputBuffer;
        } else {
            $outputStack[] = substr($inputBuffer, 0, $slashPos);
            $inputBuffer = substr($inputBuffer, $slashPos);

    return ltrim(implode($outputStack), "/");

/**
 * Check if a URL has already been scanned.
 *
 * The URL is looked up in the global $scanned map both as given and in its
 * counterpart form with/without a trailing slash, so "…/dir" and "…/dir/"
 * count as the same page.
 *
 * @param string $url
 * @return bool
 */
function is_scanned($url)
{
    global $scanned;

    if (isset($scanned[$url])) {
        return true;
    }

    // Check if in array as dir and non-dir
    $alternate = (substr($url, -1) === "/") ? substr($url, 0, -1) : $url . "/";

    return isset($scanned[$alternate]);
}

/**
 * Tell whether $haystack ends with $needle.
 *
 * @param string $haystack
 * @param string $needle An empty needle always matches.
 * @return bool
 */
function ends_with($haystack, $needle)
{
    $length = strlen($needle);
    if ($length == 0) {
        return true;
    }

    return (substr($haystack, -$length) === $needle);
}

/**
 * Get the directory part of a URL, used as the base for relative links.
 *
 * Everything after the last "/" is cut off; the trailing slash is kept.
 * A string without any "/" yields an empty string.
 *
 * @param string $path
 * @return string
 */
function get_path($path)
{
    $last_slash = strrpos($path, "/");
    if ($last_slash === false) {
        return "";
    }

    return substr($path, 0, $last_slash + 1);
}

/**
 * Get the root of the domain.
 *
 * The URL is split on "/" and rebuilt from its scheme part and host part.
 *
 * @param string $href Absolute URL including the scheme.
 * @return string Scheme and host followed by a single trailing slash.
 */
function domain_root($href)
{
    $url_parts = explode('/', $href);

    return $url_parts[0] . '//' . $url_parts[2] . '/';
}

// The curl client is created outside of the function to avoid re-creating it for performance reasons
$curl_client = curl_init();

/**
 * Fetch a URL with the shared cURL handle.
 *
 * If the response is a redirect, the redirect target (query string removed)
 * is crawled through scan_url() unless it is blacklisted.
 *
 * @param string $url URL to request.
 * @return array Three elements:
 *               [0] the raw response (headers included) when the status is
 *                   200 and the content type mentions "html", the marker
 *                   string "This is a PDF" for indexed PDFs, otherwise false;
 *               [1] an ISO 8601 last-modified string, or null when
 *                   $enable_modified is off;
 *               [2] true when the response is an image and $index_img is on.
 */
function get_data($url)
{
    global $curl_validate_certificate, $curl_client, $index_pdf, $index_img, $crawler_user_agent, $enable_modified;

    //Set URL
    curl_setopt($curl_client, CURLOPT_URL, $url);
    //Return the response instead of printing it
    curl_setopt($curl_client, CURLOPT_RETURNTRANSFER, 1);
    //Get headers
    curl_setopt($curl_client, CURLOPT_HEADER, 1);
    //Optionally avoid validating SSL
    curl_setopt($curl_client, CURLOPT_SSL_VERIFYPEER, $curl_validate_certificate);
    //Set user agent
    curl_setopt($curl_client, CURLOPT_USERAGENT, $crawler_user_agent);
    //Ask cURL for the remote modification time; without this option
    //CURLINFO_FILETIME always reports -1
    if ($enable_modified) {
        curl_setopt($curl_client, CURLOPT_FILETIME, true);
    }

    //Get data
    $data = curl_exec($curl_client);
    $content_type = (string) curl_getinfo($curl_client, CURLINFO_CONTENT_TYPE);
    $http_code = curl_getinfo($curl_client, CURLINFO_HTTP_CODE);
    $redirect_url = curl_getinfo($curl_client, CURLINFO_REDIRECT_URL);

    //Scan new url, if redirect
    if ($redirect_url) {
        logger("URL is a redirect.", 1);
        if (strpos($redirect_url, '?') !== false) {
            // explode() takes the delimiter first
            $redirect_url = explode("?", $redirect_url)[0];
        }
        unset($url, $data);

        if (!check_blacklist($redirect_url)) {
            logger("Redirected URL is in blacklist", 1);
        } else {
            scan_url($redirect_url);
        }
    }

    //If content acceptable, return it. If not, `false`.
    //(After a redirect the status is not 200, so the unset $data is never read.)
    $html = ($http_code != 200 || stripos($content_type, "html") === false) ? false : $data;

    //Additional data
    if ($enable_modified) {
        // CURLINFO_FILETIME is already a Unix timestamp (-1 when unknown).
        // When the server sends no modification time, the crawl time is used
        // so every entry still gets a <lastmod>.
        $timestamp = curl_getinfo($curl_client, CURLINFO_FILETIME);
        $modified = ($timestamp > 0) ? date('c', $timestamp) : date('c');
    } else {
        $modified = null;
    }

    if (stripos($content_type, "application/pdf") !== false && $index_pdf) {
        $html = "This is a PDF";
    }

    // "image/…" sits at offset 0 of the content type, so the position must be
    // compared against false rather than tested for truthiness.
    $is_image = (stripos($content_type, "image/") !== false) && $index_img;

    //Return it as an array
    return array($html, $modified, $is_image);
}

/**
 * Try to match a string against the global $blacklist.
 *
 * Each blacklist entry is a shell-style wildcard pattern evaluated with
 * fnmatch(). A $blacklist that is not an array lets everything through.
 *
 * @param string $string URL to test.
 * @return bool false when the string matches a blacklist pattern,
 *              true when it is allowed.
 */
function check_blacklist($string)
{
    global $blacklist;

    if (is_array($blacklist)) {
        foreach ($blacklist as $illegal) {
            if (fnmatch($illegal, $string)) {
                return false;
            }
        }
    }

    return true;
}

/**
 * Extract an array of URLs from an HTML document.
 *
 * The second capture group of $regexp is taken as the link target. Every
 * target is stripped of its fragment, converted to an absolute URL and
 * validated; targets that are invalid, outside the target site, already
 * scanned or blacklisted become false in the returned array.
 *
 * @param string $html       Document to search.
 * @param string $parent_url URL of the document, base for relative links.
 * @param string $regexp     Pattern body (without delimiters) whose second
 *                           capture group is the link target.
 * @return array One entry per match: the normalised absolute URL, or false
 *               for a rejected link. Empty when nothing matched.
 */
function get_links($html, $parent_url, $regexp)
{
    if (preg_match_all("/$regexp/siU", $html, $matches)) {
        if ($matches[2]) {
            $found = array_map(function ($href) use (&$parent_url) {
                global $real_site, $ignore_arguments;

                logger("Checking $href", 2);

                if (strpos($href, "#") !== false) {
                    logger("Dropping pound.", 2);
                    $href = preg_replace('/\#.*/', '', $href);
                }

                //Separate $href from $query_string
                $query_string = '';
                if (strpos($href, '?') !== false) {
                    list($href, $query_string) = explode('?', $href, 2);

                    //Decode &amp; so the query string does not break the curl client
                    $query_string = str_replace('&amp;', '&', $query_string);
                }
                if ($ignore_arguments) {
                    $query_string = '';
                }

                if ((substr($href, 0, 7) != "http://") && (substr($href, 0, 8) != "https://")) {
                    // Link does not call (potentially) external page
                    if (strpos($href, ":") !== false) {
                        // e.g. mailto:, tel:, javascript:
                        logger("URL is an invalid protocol", 1);
                        return false;
                    }
                    if ($href == '/') {
                        logger("$href is domain root", 2);
                        $href = $real_site;
                    } elseif (substr($href, 0, 1) == '/') {
                        logger("$href is relative to root, convert to absolute", 2);
                        $href = domain_root($real_site) . substr($href, 1);
                    } else {
                        logger("$href is relative, convert to absolute", 2);
                        $href = get_path($parent_url) . $href;
                    }
                }
                logger("Result: $href", 2);

                if (!filter_var($href, FILTER_VALIDATE_URL)) {
                    logger("URL is not valid. Rejecting.", 1);
                    return false;
                }
                if (substr($href, 0, strlen($real_site)) != $real_site) {
                    logger("URL is not part of the target domain. Rejecting.", 1);
                    return false;
                }

                $full_url = $href . ($query_string ? '?' . $query_string : '');
                if (is_scanned($full_url)) {
                    return false;
                }
                if (!check_blacklist($href)) {
                    logger("URL is blacklisted. Rejecting.", 1);
                    return false;
                }

                return flatten_url($full_url);
            }, $matches[2]);

            return $found;
        }
    }

    logger("Found nothing", 2);

    return array();
}

/**
 * Crawl one URL: fetch it, append it to the sitemap and recurse into the
 * links it contains.
 *
 * Works on the shared crawl state held in globals: $scanned (URLs already
 * requested), $deferredLinks (URLs already queued by some page), $depth
 * (current recursion depth), $indexed (number of sitemap entries written)
 * and $file_stream (open handle of the sitemap being written).
 *
 * @param string $url Absolute URL to crawl.
 * @return int|null The depth value before it was decremented when the URL is
 *                  rejected, nothing otherwise; the return value carries no
 *                  further meaning.
 */
function scan_url($url)
{
    global $scanned, $deferredLinks, $file_stream, $freq, $priority, $enable_priority, $enable_frequency, $max_depth, $depth, $real_site, $indexed;

    // One level deeper on entry; every exit path below undoes this.
    $depth++;

    logger("Scanning $url", 2);
    if (is_scanned($url)) {
        logger("URL has already been scanned. Rejecting.", 1);
        return $depth--;
    }
    if (substr($url, 0, strlen($real_site)) != $real_site) {
        logger("URL is not part of the target domain. Rejecting.", 1);
        return $depth--;
    }
    if (!($depth <= $max_depth || $max_depth == 0)) {
        logger("Maximum depth exceeded. Rejecting.", 1);
        return $depth--;
    }

    //Note that URL has been scanned
    $scanned[$url] = 1;

    //Send cURL request
    list($html, $modified, $is_image) = get_data($url);

    if ($is_image) {
        //Url is an image (image indexing is not implemented)
    }

    if (!$html) {
        logger("Invalid Document. Rejecting.", 1);
        return $depth--;
    }

    // A bare "&" is not allowed inside XML; escape it unless the URL already
    // appears to contain entities.
    if (strpos($url, "&") && strpos($url, ";") === false) {
        $url = str_replace("&", "&amp;", $url);
    }

    $map_row = "	<url>\n";
    $map_row .= "		<loc>$url</loc>\n";
    if ($modified) {
        $map_row .= "		<lastmod>$modified</lastmod>\n";
    }
    if ($enable_frequency) {
        $map_row .= "		<changefreq>$freq</changefreq>\n";
    }
    if ($enable_priority) {
        $map_row .= "		<priority>$priority</priority>\n";
    }
    $map_row .= "	</url>\n";
    fwrite($file_stream, $map_row);
    $indexed++;
    logger("Added: " . $url . (($modified) ? " [Modified: " . $modified . "]" : ''), 0);
    unset($is_image, $map_row);

    // Extract urls from <a href="??"></a>
    $ahrefs = get_links($html, $url, "<a\s[^>]*href=(\"|'??)([^\" >]*?)\\1[^>]*>(.*)<\/a>");

    // Extract urls from <frame src="??">
    $framesrc = get_links($html, $url, "<frame\s[^>]*src=(\"|'??)([^\" >]*?)\\1[^>]*>");

    // Drop rejected links (false) and links another page has already queued
    $links = array_filter(array_merge($ahrefs, $framesrc), function ($item) use (&$deferredLinks) {
        return $item && !isset($deferredLinks[$item]);
    });
    unset($html, $url, $ahrefs, $framesrc);

    logger("Found urls: " . join(", ", $links), 2);

    //Note that URL has been deferred
    foreach ($links as $href) {
        if ($href) {
            $deferredLinks[$href] = 1;
        }
    }

    // Crawl the newly found links
    foreach ($links as $href) {
        if ($href) {
            scan_url($href);
        }
    }

    $depth--;
}

// fnmatch() filler for non-POSIX systems

if (!function_exists('fnmatch')) {
    /**
     * Minimal replacement for the native fnmatch().
     *
     * Supports the wildcards "*" (any run of characters) and "?" (a single
     * character); every other character is matched literally. Matching is
     * case-insensitive.
     *
     * @param string $pattern Shell-style wildcard pattern.
     * @param string $string  String to test.
     * @return bool
     */
    function fnmatch($pattern, $string)
    {
        // Quote the whole pattern, then turn the quoted wildcards back into
        // their regular-expression equivalents.
        $regex = "#^" . strtr(preg_quote($pattern, '#'), array('\*' => '.*', '\?' => '.')) . "$#i";

        return preg_match($regex, $string) === 1;
    } // end
} // end if

$version_functions = 2;

sitemap.config.php — इस फ़ाइल में निम्नलिखित सामग्री रखें (हर PHP फ़ाइल की पहली पंक्ति `<?php` होनी चाहिए):
/*
Sitemap Generator by Slava Knyazev. Further acknowledgements in the file.

I also live on GitHub:
Contact me:

Make sure to use the latest revision by downloading from github:

Usage
Usage is pretty straightforward:
- Configure the crawler by editing this file.
- Select the file to which the sitemap will be saved
- Select URL to crawl
- Configure blacklists, accepts the use of wildcards (example: *.jpg)
- Generate sitemap
- Either send a GET request to this script or run it from the command line (refer to README file)
- Submit to Google
- Setup a CRON Job execute this script every so often

It is recommended you don't remove the above for future reference.
*/

// Default site to crawl. Set it here or pass `site=...` on the command line.
$site = "";

// Default sitemap filename and the permissions applied to it
$file = "sitemap.xml";
$permissions = 0644;

// Depth of the crawl, 0 is unlimited
$max_depth = 0;

// Show changefreq
$enable_frequency = true;

// Show priority
$enable_priority = true;

// Default values for changefreq and priority
$freq = "daily";
$priority = "1.00";

// Add lastmod to every entry. The original note describes this as
// unreliable and disabled by default; this copy turns it on.
$enable_modified = true;

// Disable this for misconfigured, but tolerable SSL server.
$curl_validate_certificate = true;

// The pages will be excluded from crawl and sitemap.
// Use for excluding non-html files to increase performance and save bandwidth.
// Entries are wildcard patterns, for example:
$blacklist = array(
    // "*.jpg",
    // "*/private/*",
);

// When true, query strings (GET arguments) are stripped from discovered
// links. Leave false if your site requires GET arguments to function.
$ignore_arguments = false;

// Not yet implemented. See issue #19 for more information.
$index_img = false;

//Index PDFs
$index_pdf = true;

// Set the user agent for crawler
$crawler_user_agent = "Mozilla/5.0 (compatible; Sitemap Generator Crawler)";

// Header of the sitemap.xml. The xmlns value is the namespace required by
// the sitemap protocol for the <urlset> element.
$xmlheader = '<?xml version="1.0" encoding="UTF-8"?>
<!-- Sitemap file generated at ' . date("D M j G:i:s T Y") . ' -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">';

// Optionally configure debug options: which kinds of log lines are printed
$debug = array(
    "add" => true,
    "reject" => false,
    "warn" => false,
);

//Modify only if configuration version is broken
$version_config = 2;

फ़ाइलों की सामग्री को उसी नाम से बनाई गई फ़ाइलों में कॉपी करें।

2. स्क्रिप्ट चलाना

sitemap-gen फ़ोल्डर के भीतर रहते हुए नीचे दिया गया कमांड चलाएँ; domain.tld के स्थान पर अपना डोमेन लिखें और वह पथ निर्दिष्ट करें जहाँ sitemap.xml फ़ाइल सहेजी जाएगी:

php sitemap.php file=/home/user/sitemap-gen/sitemap.xml site=https://domain.tld

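इस उदाहरण में हमने php-xml मॉड्यूल के साथ PHP 8.1 का उपयोग किया। स्क्रिप्ट cURL फ़ंक्शन (curl_init आदि) का उपयोग करती है, इसलिए php-curl एक्सटेंशन भी स्थापित होना चाहिए।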

स्क्रिप्ट आपकी साइट के सभी आंतरिक लिंक क्रॉल करेगी और काम पूरा होने पर, लगे हुए समय के साथ, आपको इसकी सूचना देगी।

निर्माण प्रक्रिया के दौरान, आप यह भी देखेंगे कि यह वर्तमान में कौन से पेज क्रॉल कर रहा है।

[+] Added: [Modified: 2023-10-21T17:35:20+01:00] 
[+] Added: [Modified: 2023-10-21T17:35:21+01:00] 
[+] Added: [Modified: 2023-10-21T17:35:21+01:00]

5,000 पृष्ठों वाला साइटमैप बनाने में लगभग 45 मिनट लगे

यदि आप चाहें, तो आप इस कार्य को क्रॉन में जोड़ सकते हैं ताकि स्क्रिप्ट नियमित रूप से फ़ाइल में डेटा को अपडेट करे sitemap.xml

