urlgrab5.php 820 B

123456789101112131415161718192021222324252627282930313233343536373839
  1. <?php
  /*
   * urlgrab5.php
   *
   * A simple command-line utility to extract all of the URLs contained
   * within <A HREF> tags from a document.
   *
   * NOTE: Only works with tidy for PHP 5; please see urlgrab.php for tidy for PHP 4.3.x.
   *
   * By: John Coggeshall <john@php.net>
   *
   * Usage: php urlgrab5.php <file>
   *
   */
  /**
   * Recursively collect the href value of every <a> element at or below $node.
   *
   * Anchors that carry no href attribute (for example <a name="top">) are
   * skipped, so the result never contains NULL placeholders and no
   * undefined-index diagnostic is raised for them.
   *
   * @param tidyNode   $node Node at which to start the walk.
   * @param array|null $urls Accumulator, passed by reference. Any non-array
   *                         value (including the default NULL) is replaced
   *                         with an empty array before collection starts.
   *
   * @return array The hrefs found, appended to whatever $urls already held;
   *               a node is examined before its children, and children are
   *               visited in the order tidy exposes them.
   */
  function dump_nodes(tidyNode $node, &$urls = NULL) {
      if (!is_array($urls)) {
          $urls = array();
      }

      // Only anchors that actually have an href contribute a URL.
      if (isset($node->id) && $node->id === TIDY_TAG_A && isset($node->attribute['href'])) {
          $urls[] = $node->attribute['href'];
      }

      if ($node->hasChildren()) {
          foreach ($node->child as $child) {
              // The same accumulator is shared by reference down the tree.
              dump_nodes($child, $urls);
          }
      }

      return $urls;
  }
  // Command-line entry point: parse the file named by the first argument,
  // let tidy repair the markup, then print every link found in it.
  if (!isset($_SERVER['argv'][1])) {
      fwrite(STDERR, "Usage: php urlgrab5.php <file>\n");
      exit(1);
  }

  $a = tidy_parse_file($_SERVER['argv'][1]);
  if (!($a instanceof tidy)) {
      // Without a tidy object there is nothing to call cleanRepair() on;
      // stop with a message instead of a fatal error.
      fwrite(STDERR, "Unable to parse " . $_SERVER['argv'][1] . "\n");
      exit(1);
  }

  $a->cleanRepair();

  // dump_nodes() requires a tidyNode, so fall back to an empty list when the
  // document has no <html> node to start from.
  $root = $a->html();
  print_r(($root instanceof tidyNode) ? dump_nodes($root) : array());
  32. ?>