Source: http://nadeausoftware.com/articles/2008/05/php_tip_how_parse_and_build_urls#Code
Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com
BSD Licensed.
Modified by Michael Crumley (added match groups)
| Text | Result |
|---|---|
| /search?q=regular%20expression | match |
| http://www.google.com/ | match |
| http://www.google.com/search?q=regular%20expression | match |
| www.google.com/ | match |
/*
* RFC3986 specifies the components of a Uniform Resource Identifier (URI).
* A portion of the ABNF is repeated here:
*
* URI-reference = URI
* / relative-ref
*
* URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
*
* relative-ref = relative-part [ "?" query ] [ "#" fragment ]
*
* hier-part = "//" authority path-abempty
* / path-absolute
* / path-rootless
* / path-empty
*
* relative-part = "//" authority path-abempty
* / path-absolute
* / path-noscheme
* / path-empty
*
* authority = [ userinfo "@" ] host [ ":" port ]
*/
// Builds $xurl, an anchored PCRE pattern (WITHOUT delimiters) that splits a
// URL or relative reference into scheme, authority, path, query and fragment
// through nested capture groups. Every fragment below is plain string
// concatenation; the order of the parentheses defines the group numbers, so
// adding, removing or reordering groups changes the indices callers see.
//
// The assembled pattern contains unescaped '/' and '#', so neither can be
// used as the preg delimiter. '!' is escaped inside the character sets, which
// suggests '!' is the intended delimiter (NOTE(review): confirm with callers).

// Character sets from RFC3986: unreserved + sub-delims, for use inside [...].
// The hyphen is escaped so it is a literal rather than a range operator.
$xunressub = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
// pchar additionally allows ':', '@' and the '%' of percent-encoded octets.
$xpchar = $xunressub . ':@%';

// Scheme from RFC3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ).
// The hyphen must be escaped: an unescaped "+-." is the range 0x2B-0x2E,
// which would also admit ',' in a scheme name.
$xscheme = '([a-zA-Z][a-zA-Z\d+\-.]*)';

// User info (user + password) from RFC3986.
$xuserinfo = '(([' . $xunressub . '%]*)' .
    '(:([' . $xunressub . ':%]*))?)';

// IPv4 from RFC3986 (without digit constraints).
$xipv4 = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';

// IPv6 from RFC2732 (without digit and grouping constraints).
$xipv6 = '(\[([a-fA-F\d.:]+)\])';

// Host name from RFC1035. Technically, must start with a letter.
// Relax that restriction to better parse URL structure, then
// leave host name validation to application.
// The hyphen is escaped: PCRE2 (PHP >= 7.3) rejects an unescaped '-' that
// directly follows \d inside a class ("invalid range in character class").
$xhost_name = '([a-zA-Z\d\-.%]+)';

// Authority from RFC3986. Skip IP future.
// The host-name alternative is tried first and also accepts digits and dots,
// so a dotted-quad address is captured by the host-name group, not the IPv4
// group. The order is kept so existing group numbers do not shift.
$xhost = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
$xport = '(\d*)';
// The host itself is optional (the '?' after $xhost), so an empty authority
// as in "scheme:///path" is accepted.
$xauthority = '((' . $xuserinfo . '@)?' . $xhost .
    '?(:' . $xport . ')?)';

// Path from RFC3986. Blend absolute & relative for efficiency.
$xslash_seg = '(/[' . $xpchar . ']*)';
// "//" authority followed by zero or more slash segments.
$xpath_authabs = '((//' . $xauthority . ')(' . $xslash_seg . '*))';
$xpath_rel = '([' . $xpchar . ']+' . $xslash_seg . '*)';
$xpath_abs = '(/(' . $xpath_rel . ')?)';
$xapath = '(' . $xpath_authabs . '|' . $xpath_abs .
    '|' . $xpath_rel . ')';

// Query and fragment from RFC3986: pchar plus '/' and '?'.
$xqueryfrag = '([' . $xpchar . '/?' . ']*)';

// URL: optional "scheme:", optional path (with or without authority),
// optional "?query", optional "#fragment"; anchored at both ends.
$xurl = '^(' . $xscheme . ':)?' . $xapath . '?' .
    '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';