//                      The symbol naming the top-level domain of the USIN.
//   domain_extensions =>
//                      An array of domain extensions, each of which is either
//                      a parenthesized phrase, or a "." followed by a subdomain
//                      symbol.  Always present (empty if there are none).
//   collection_label =>
//                      The label (if any) of a publication or collection.
//                      The "/" separating the domain and the label is stripped off.
//                      This key is absent when the USIN has no label.
//   item_extensions =>
//                      An array of item extensions, each of which is either
//                      a parenthesized phrase, or an operator (excluding "!")
//                      followed by a symbol.  Always present (empty if there
//                      are none); item extensions are only recognized after
//                      a collection label.
//   attributes =>
//                      An array of attributes at the end of the USIN, each
//                      of which is a symbol followed by an optional parenthesized
//                      phrase.
//                      The "!" symbol introducing each attribute is stripped off.
//                      Always present (empty if there are none).
//   junk =>
//                      Junk following the longest legal USIN found from the input.
//
//  If the input does not even start with a symbol, the result contains
//  only the junk key, holding the whole (decoded, trimmed) input.
//
//  Example 1:
//  parse_USIN('ISSN/0361-526X:36(1/2)@175') =
//     {principal_domain => 'ISSN',
//      domain_extensions => [],
//      collection_label => '0361-526X',
//      item_extensions => [':36', '(1/2)', '@175'],
//      attributes => [],
//      junk => ''}
//  This example illustrates journal article citation,
//  with conventional item extension syntax for volume, issue and page.
//
//  Example 2:
//  parse_USIN('RDNS(sfu.ca).CMPT/MSc:2000$SerbanTatu!author(1)') =
//     {principal_domain => 'RDNS',
//      domain_extensions => ['(sfu.ca)', '.CMPT'],
//      collection_label => 'MSc',
//      item_extensions => [':2000', '$SerbanTatu'],
//      attributes => ['author(1)'],
//      junk => ''}
//
//  Escape-encoding.  parse_USIN applies rawurldecode() to its argument
//  before parsing, so an escape-encoded USIN may be passed in directly.
//  Note that input which the caller has already decoded is therefore
//  decoded a second time.
//---------------------------------------------------------------
//
//  Regular Expressions for USIN Syntactic Components.
//
//  The patterns are read by parse_USIN through "global".  Declaring them
//  global here as well is a no-op when this file is included at the top
//  level, but keeps them reachable when the file is included from inside
//  a function.  Without it the patterns would be empty inside parse_USIN
//  and its matching loops would never terminate.
//
global $symbolRE, $phraseRE, $hyphRE, $domain_extensionRE,
       $operatorRE, $item_extensionRE, $attributeRE;

$symbolRE = '[A-Za-z0-9](?:[-_]?[A-Za-z0-9])*';
$phraseRE = '\([-A-Za-z0-9_/:!@$*~+,.]+\)';
//
//  The BibP specification states that the letter, digit and extender
//  classes are subject to future extension to include non-ASCII UTF-8
//  characters.  The following two REs would be needed for such
//  an extension, but this has not been tested.
//
//  $symbolRE = '[A-Za-z0-9\x80-\xFF](?:[-_]?[A-Za-z0-9\x80-\xFF])*';
//  $phraseRE = '\([-A-Za-z0-9_/:!@$*~+,.\x80-\xFF]+\)';
//
//  The USIN hyphenation convention allows whitespace to be introduced
//  in some contexts: a "-" followed by optional whitespace may precede
//  an extension, the label separator or an attribute.
$hyphRE = '(?:-[\s]*)?';
//
//  Each of the three REs below has exactly one capturing group, which
//  holds the component without any leading hyphenation.
$domain_extensionRE = "$hyphRE(\\.$symbolRE|$phraseRE)";
//  Match operators of any length, except the single "!" for attributes.
$operatorRE= "(?:![/:!@$*~+,.]+|[/:@$*~+,.][/:!@$*~+,.]*)";
$item_extensionRE = "$hyphRE($operatorRE$symbolRE|$phraseRE)";
$attributeRE = "$hyphRE!($symbolRE(?:$phraseRE)?)";

function parse_USIN($USIN_proto) {
    global $symbolRE, $hyphRE, $domain_extensionRE,
           $item_extensionRE, $attributeRE;

    //  NOTE:  The regular expressions are delimited by "#" instead of
    //  the typical "/".  This avoids the need to escape a literal
    //  "/" which sometimes occurs in USIN syntax and hence in the REs
    //  below.  The "#" character is safe.
    //
    //  $rest is the not-yet-parsed tail of the input; every successful
    //  match is chopped off its front.
    $rest = trim(rawurldecode($USIN_proto));

    //  A USIN must start with the principal domain symbol; otherwise
    //  the whole input is junk.
    if (!preg_match("#^$symbolRE#", $rest, $matches)) {
        return array('junk' => $rest);
    }
    $result_map = array(
        'principal_domain'  => $matches[0],
        'domain_extensions' => array(),
    );
    $rest = substr($rest, strlen($matches[0]));

    while (preg_match("#^$domain_extensionRE#", $rest, $matches)) {
        $result_map['domain_extensions'][] = $matches[1];
        $rest = substr($rest, strlen($matches[0]));
    }

    //  Item extensions are only legal after a "/label".
    $item_extensions = array();
    if (preg_match("#^$hyphRE/($symbolRE)#", $rest, $matches)) {
        $result_map['collection_label'] = $matches[1];
        $rest = substr($rest, strlen($matches[0]));
        while (preg_match("#^$item_extensionRE#", $rest, $matches)) {
            $item_extensions[] = $matches[1];
            $rest = substr($rest, strlen($matches[0]));
        }
    }
    $result_map['item_extensions'] = $item_extensions;

    $result_map['attributes'] = array();
    while (preg_match("#^$attributeRE#", $rest, $matches)) {
        $result_map['attributes'][] = $matches[1];
        $rest = substr($rest, strlen($matches[0]));
    }

    //  Older PHP versions return false from substr() once the input is
    //  used up; the cast guarantees that junk is always a string.
    $result_map['junk'] = (string) $rest;
    return $result_map;
}
?>