URL > Valid URL (RFC 3986) Favorite

Source: http://nadeausoftware.com/articles/2008/05/php_tip_how_parse_and_build_urls#Code

Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com
BSD Licensed.
Modified by Michael Crumley (added match groups)

Sample Matches
Text Result
/search?q=regular%20expression
Group #0
/search?q=regular%20expression
Group #6
/search
Group #8
q=regular%20expression
match
http://www.google.com/
Group #0
http://www.google.com/
scheme
http
Group #3
www.google.com
Group #5
/
match
http://www.google.com/search?q=regular%20expression
Group #0
http://www.google.com/search?q=regular%20expression
scheme
http
Group #3
www.google.com
Group #5
/search
Group #8
q=regular%20expression
match
www.google.com/
Group #0
www.google.com/
Group #7
www.google.com/
match
Notes
/*
 * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 * The relevant portion of its ABNF grammar is repeated here:
 *
 *  URI-reference   = URI
 *          / relative-ref
 *
 *  URI     = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 *  relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 *
 *  hier-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-rootless
 *          / path-empty
 *
 *  relative-part   = "//" authority path-abempty
 *          / path-absolute
 *          / path-noscheme
 *          / path-empty
 *
 *  authority   = [ userinfo "@" ] host [ ":" port ]
 */

    // Building blocks for a URL-matching regular expression (RFC 3986).
    // Each fragment below is a plain string; they are concatenated into
    // $xurl at the end.  Every parenthesis is a capturing group, so adding,
    // removing or reordering groups shifts the match indexes.

    // Character sets from RFC3986: unreserved + sub-delims, written as the
    // inside of a character class (no surrounding brackets).  The "-" is
    // escaped so it stays literal wherever the set is embedded.
    // NOTE(review): "!" is escaped as well, presumably so the final pattern
    // can be wrapped in "!" delimiters -- confirm against the preg_match()
    // call site.
    $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
    // pchar adds ":" and "@".  Percent-escapes are admitted only via "%";
    // the two hex digits that should follow are not checked.
    $xpchar        = $xunressub . ':@%';

    // Scheme from RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
    // The "-" is placed last in the class so it is literal; written as
    // "+-." it would form the range "+".."." and also accept ",".
    $xscheme        = '([a-zA-Z][a-zA-Z\d+.-]*)';

    // User info (user + password) from RFC3986.
    // Captures the whole "user:password" text, the user part, the optional
    // ":password" part, and the password on its own.
    $xuserinfo     = '((['  . $xunressub . '%]*)' .
                     '(:([' . $xunressub . ':%]*))?)';

    // IPv4 from RFC3986 (without digit constraints).
    $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

    // IPv6 from RFC2732 (without digit and grouping constraints).
    // Outer group includes the square brackets, inner group does not.
    $xipv6         = '(\[([a-fA-F\d.:]+)\])';

    // Host name from RFC1035.  Technically, must start with a letter.
    // Relax that restriction to better parse URL structure, then
    // leave host name validation to application.
    // The "-" is placed last in the class so it is literal: a hyphen
    // between "\d" and "." is rejected by PCRE2 (PHP 7.3+) as an invalid
    // range, which makes the whole pattern fail to compile.
    $xhost_name    = '([a-zA-Z\d.%-]+)';

    // Authority from RFC3986.  Skip IP future.
    // The host-name class also accepts digits and dots, so a dotted-quad
    // address is matched by the first alternative.
    $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
    $xport         = '(\d*)';
    // User info, host and port are each optional here.
    $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
                 '?(:' . $xport . ')?)';

    // Path from RFC3986.  Blend absolute & relative for efficiency.
    $xslash_seg    = '(/[' . $xpchar . ']*)';
    // "//authority" followed by zero or more "/segment" parts.  Reuses
    // $xslash_seg; the resulting string is the same as spelling the
    // segment out by hand.
    $xpath_authabs = '((//' . $xauthority . ')(' . $xslash_seg . '*))';
    // Non-empty first segment followed by zero or more "/segment" parts.
    $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
    // Leading "/" optionally followed by a relative path.
    $xpath_abs     = '(/(' . $xpath_rel . ')?)';
    // Alternatives are tried in this order: authority form, absolute,
    // relative.
    $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
             '|' . $xpath_rel . ')';

    // Query and fragment from RFC3986: pchar plus "/" and "?".
    $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';

    // URL.
    // Anchored at both ends; scheme, path, query and fragment are each
    // optional.  No delimiters or modifiers are included -- the caller
    // must add them before passing the pattern to a preg_* function.
    $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
                     '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';