class-sitemap-image-parser.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510
  1. <?php
  2. /**
  3. * WPSEO plugin file.
  4. *
  5. * @package WPSEO\XML_Sitemaps
  6. */
  /**
   * Parses images from the given post or term.
   *
   * Images are collected from the featured image, `<img>` tags in the content,
   * `[gallery]` shortcodes and, for image attachments, the attachment itself.
   */
  class WPSEO_Sitemap_Image_Parser {

      /**
       * Holds the home_url() value to speed up loops.
       *
       * @var string
       */
      protected $home_url = '';

      /**
       * Holds site URL hostname, without a leading "www." label.
       *
       * @var string
       */
      protected $host = '';

      /**
       * Holds site URL protocol.
       *
       * @var string
       */
      protected $scheme = 'http';

      /**
       * Cached set of attachments for multiple posts.
       *
       * @var array
       */
      protected $attachments = [];

      /**
       * Holds blog charset value for use in DOM parsing.
       *
       * @var string
       */
      protected $charset = 'UTF-8';

      /**
       * Set up URL properties for reuse.
       */
      public function __construct() {
          $this->home_url = home_url();
          $parsed_home    = wp_parse_url( $this->home_url );

          if ( ! empty( $parsed_home['host'] ) ) {
              $host = $parsed_home['host'];

              // Only drop a leading "www." label. Replacing every occurrence would
              // corrupt hosts such as "mywww.example.com".
              if ( 0 === strpos( $host, 'www.' ) ) {
                  $host = substr( $host, 4 );
              }

              $this->host = $host;
          }

          if ( ! empty( $parsed_home['scheme'] ) ) {
              $this->scheme = $parsed_home['scheme'];
          }

          $this->charset = esc_attr( get_bloginfo( 'charset' ) );
      }

      /**
       * Get set of image data sets for the given post.
       *
       * @param object $post Post object to get images for.
       *
       * @return array Image items; an empty array when $post is not an object.
       */
      public function get_images( $post ) {
          $images = [];

          if ( ! is_object( $post ) ) {
              return $images;
          }

          // Featured image.
          $thumbnail_id = get_post_thumbnail_id( $post->ID );
          if ( $thumbnail_id ) {
              $src      = $this->get_absolute_url( $this->image_url( $thumbnail_id ) );
              $alt      = WPSEO_Image_Utils::get_alt_tag( $thumbnail_id );
              $title    = get_post_field( 'post_title', $thumbnail_id );
              $images[] = $this->get_image_item( $post, $src, $title, $alt );
          }

          /**
           * Filter: 'wpseo_sitemap_content_before_parse_html_images' - Filters the post content
           * before it is parsed for images.
           *
           * @param string $content The raw/unprocessed post content.
           */
          $content = apply_filters( 'wpseo_sitemap_content_before_parse_html_images', $post->post_content );

          // Images embedded as <img> tags in the content.
          foreach ( $this->parse_html_images( $content ) as $image ) {
              $images[] = $this->get_image_item( $post, $image['src'], $image['title'], $image['alt'] );
          }

          // Images referenced through [gallery] shortcodes.
          foreach ( $this->parse_galleries( $content, $post->ID ) as $attachment ) {
              $src      = $this->get_absolute_url( $this->image_url( $attachment->ID ) );
              $alt      = WPSEO_Image_Utils::get_alt_tag( $attachment->ID );
              $images[] = $this->get_image_item( $post, $src, $attachment->post_title, $alt );
          }

          // An image attachment lists itself.
          if ( 'attachment' === $post->post_type && wp_attachment_is_image( $post ) ) {
              $src      = $this->get_absolute_url( $this->image_url( $post->ID ) );
              $alt      = WPSEO_Image_Utils::get_alt_tag( $post->ID );
              $images[] = $this->get_image_item( $post, $src, $post->post_title, $alt );
          }

          // Drop items that ended up without a URL (original keys are preserved).
          foreach ( $images as $key => $image ) {
              if ( empty( $image['src'] ) ) {
                  unset( $images[ $key ] );
              }
          }

          /**
           * Filter images to be included for the post in XML sitemap.
           *
           * @param array $images  Array of image items.
           * @param int   $post_id ID of the post.
           */
          $images = apply_filters( 'wpseo_sitemap_urlimages', $images, $post->ID );

          return $images;
      }

      /**
       * Get the images in the term description.
       *
       * @param object $term Term to get images from description for.
       *
       * @return array Image data arrays with 'src', 'title' and 'alt' keys; an empty
       *               array when $term is not an object or has no description.
       */
      public function get_term_images( $term ) {
          // Same guard as get_images(): never dereference a non-object.
          if ( ! is_object( $term ) || ! isset( $term->description ) ) {
              return [];
          }

          $images = $this->parse_html_images( $term->description );

          foreach ( $this->parse_galleries( $term->description ) as $attachment ) {
              $images[] = [
                  'src'   => $this->get_absolute_url( $this->image_url( $attachment->ID ) ),
                  'title' => $attachment->post_title,
                  'alt'   => WPSEO_Image_Utils::get_alt_tag( $attachment->ID ),
              ];
          }

          return $images;
      }

      /**
       * Parse `<img />` tags in content.
       *
       * Only images whose absolute URL contains the site host and survives esc_url()
       * unchanged are returned.
       *
       * @param string $content Content string to parse.
       *
       * @return array List of arrays with 'src', 'title' and 'alt' keys.
       */
      private function parse_html_images( $content ) {
          $images = [];

          if ( ! class_exists( 'DOMDocument' ) ) {
              return $images;
          }

          if ( empty( $content ) ) {
              return $images;
          }

          // Prevent DOMDocument from bubbling warnings about invalid HTML, and remember
          // the previous setting so this process-wide switch can be restored afterwards.
          $previous_libxml_state = libxml_use_internal_errors( true );

          $post_dom = new DOMDocument();
          // The XML declaration prefix tells libxml which encoding the fragment uses.
          $post_dom->loadHTML( '<?xml encoding="' . $this->charset . '">' . $content );

          // Clear the errors, so they don't get kept in memory.
          libxml_clear_errors();
          libxml_use_internal_errors( $previous_libxml_state );

          /** @var DOMElement $img */
          foreach ( $post_dom->getElementsByTagName( 'img' ) as $img ) {

              $src = $img->getAttribute( 'src' );

              if ( empty( $src ) ) {
                  continue;
              }

              $class = $img->getAttribute( 'class' );

              if ( // This detects WP-inserted images, which we need to upsize. R.
                  ! empty( $class )
                  && false === strpos( $class, 'size-full' )
                  && preg_match( '|wp-image-(?P<id>\d+)|', $class, $matches )
                  && get_post_status( $matches['id'] )
              ) {
                  $src = $this->image_url( $matches['id'] );
              }

              $src = $this->get_absolute_url( $src );

              // Upsizing can leave no URL at all; such an image can never be listed.
              if ( empty( $src ) ) {
                  continue;
              }

              // Skip images that do not mention the site host. When the host is unknown
              // the check is skipped explicitly: strpos() with an empty needle warns on
              // PHP 7 and matches everything on PHP 8.
              if ( '' !== $this->host && false === strpos( $src, $this->host ) ) {
                  continue;
              }

              // Skip URLs that esc_url() would have to alter.
              if ( $src !== esc_url( $src ) ) {
                  continue;
              }

              $images[] = [
                  'src'   => $src,
                  'title' => $img->getAttribute( 'title' ),
                  'alt'   => $img->getAttribute( 'alt' ),
              ];
          }

          return $images;
      }

      /**
       * Parse gallery shortcodes in a given content.
       *
       * @param string $content Content string.
       * @param int    $post_id Optional. ID of post being parsed.
       *
       * @return array Set of attachment objects.
       */
      protected function parse_galleries( $content, $post_id = 0 ) {
          $attachments = [];
          $galleries   = $this->get_content_galleries( $content );

          foreach ( $galleries as $gallery ) {

              // An explicit `id` attribute overrides the post being parsed.
              $id = $post_id;

              if ( ! empty( $gallery['id'] ) ) {
                  $id = intval( $gallery['id'] );
              }

              // Forked from core gallery_shortcode() to have exact same logic. R.
              if ( ! empty( $gallery['ids'] ) ) {
                  $gallery['include'] = $gallery['ids'];
              }

              $gallery_attachments = $this->get_gallery_attachments( $id, $gallery );

              $attachments = array_merge( $attachments, $gallery_attachments );
          }

          return array_unique( $attachments, SORT_REGULAR );
      }

      /**
       * Retrieves galleries from the passed content.
       *
       * Forked from core to skip executing shortcodes for performance.
       *
       * @param string $content Content to parse for shortcodes.
       *
       * @return array A list of arrays, each containing gallery data.
       */
      protected function get_content_galleries( $content ) {
          $galleries = [];

          // Every gallery shortcode contains the literal "[gallery", so content without
          // it (or non-string content) cannot match and the regex can be skipped.
          if ( ! is_string( $content ) || false === strpos( $content, '[gallery' ) ) {
              return $galleries;
          }

          if ( ! preg_match_all( '/' . get_shortcode_regex( [ 'gallery' ] ) . '/s', $content, $matches, PREG_SET_ORDER ) ) {
              return $galleries;
          }

          foreach ( $matches as $shortcode ) {

              $attributes = shortcode_parse_atts( $shortcode[3] );

              // A valid shortcode without any attributes may come back as a string
              // rather than an array; always hand an array to the callers.
              if ( ! is_array( $attributes ) ) {
                  $attributes = [];
              }

              $galleries[] = $attributes;
          }

          return $galleries;
      }

      /**
       * Get image item array with filters applied.
       *
       * @param WP_Post $post  Post object for the context.
       * @param string  $src   Image URL.
       * @param string  $title Optional image title.
       * @param string  $alt   Optional image alt text.
       *
       * @return array
       */
      protected function get_image_item( $post, $src, $title = '', $alt = '' ) {
          $image = [];

          /**
           * Filter image URL to be included in XML sitemap for the post.
           *
           * @param string $src  Image URL.
           * @param object $post Post object.
           */
          $image['src'] = apply_filters( 'wpseo_xml_sitemap_img_src', $src, $post );

          if ( ! empty( $title ) ) {
              $image['title'] = $title;
          }

          if ( ! empty( $alt ) ) {
              $image['alt'] = $alt;
          }

          /**
           * Filter image data to be included in XML sitemap for the post.
           *
           * @param array $image {
           *     Array of image data.
           *
           *     @type string $src   Image URL.
           *     @type string $title Image title attribute (optional).
           *     @type string $alt   Image alt attribute (optional).
           * }
           *
           * @param object $post Post object.
           */
          return apply_filters( 'wpseo_xml_sitemap_img', $image, $post );
      }

      /**
       * Get attached image URL with filters applied. Adapted from core for speed.
       *
       * @param int $post_id ID of the post.
       *
       * @return string The URL, or an empty string when the uploads directory reports
       *                an error or the attachment has no file recorded.
       */
      private function image_url( $post_id ) {
          // The uploads directory data is looked up once and reused across calls.
          static $uploads;

          if ( empty( $uploads ) ) {
              $uploads = wp_upload_dir();
          }

          if ( false !== $uploads['error'] ) {
              return '';
          }

          $file = get_post_meta( $post_id, '_wp_attached_file', true );

          if ( empty( $file ) ) {
              return '';
          }

          $uploads_marker  = 'wp-content/uploads';
          $marker_position = strpos( $file, $uploads_marker );

          // Check that the upload base exists in the file location.
          if ( 0 === strpos( $file, $uploads['basedir'] ) ) {
              $src = str_replace( $uploads['basedir'], $uploads['baseurl'], $file );
          }
          elseif ( false !== $marker_position ) {
              // Keep only what follows the uploads directory marker.
              $src = $uploads['baseurl'] . substr( $file, ( $marker_position + strlen( $uploads_marker ) ) );
          }
          else {
              // It's a newly uploaded file, therefore $file is relative to the baseurl.
              $src = $uploads['baseurl'] . '/' . $file;
          }

          return apply_filters( 'wp_get_attachment_url', $src, $post_id );
      }

      /**
       * Make absolute URL for domain or protocol-relative one.
       *
       * @param string $src URL to process.
       *
       * @return string The processed URL; empty or non-string input is returned unchanged.
       */
      protected function get_absolute_url( $src ) {
          if ( empty( $src ) || ! is_string( $src ) ) {
              return $src;
          }

          if ( WPSEO_Utils::is_url_relative( $src ) === true ) {

              // Relative URLs that do not start with a slash are left untouched.
              if ( $src[0] !== '/' ) {
                  return $src;
              }

              // The URL is relative, we'll have to make it absolute.
              return $this->home_url . $src;
          }

          if ( strpos( $src, 'http' ) !== 0 ) {
              // Protocol relative URL, we add the scheme as the standard requires a protocol.
              return $this->scheme . ':' . $src;
          }

          return $src;
      }

      /**
       * Returns the attachments for a gallery.
       *
       * @param int   $id      The post ID.
       * @param array $gallery The gallery config.
       *
       * @return array The selected attachments.
       */
      protected function get_gallery_attachments( $id, $gallery ) {
          // When there are attachments to include.
          if ( ! empty( $gallery['include'] ) ) {
              return $this->get_gallery_attachments_for_included( $gallery['include'] );
          }

          // When $id is empty, just return empty array.
          if ( empty( $id ) ) {
              return [];
          }

          return $this->get_gallery_attachments_for_parent( $id, $gallery );
      }

      /**
       * Returns the attachments for the given ID.
       *
       * @param int   $id      The post ID.
       * @param array $gallery The gallery config.
       *
       * @return array The selected attachments.
       */
      protected function get_gallery_attachments_for_parent( $id, $gallery ) {
          $query = [
              'posts_per_page' => -1,
              'post_parent'    => $id,
          ];

          // When there are posts that should be excluded from result set.
          if ( ! empty( $gallery['exclude'] ) ) {
              $query['post__not_in'] = wp_parse_id_list( $gallery['exclude'] );
          }

          return $this->get_attachments( $query );
      }

      /**
       * Returns an array with attachments for the post IDs that will be included.
       *
       * @param array $include Array with IDs to include.
       *
       * @return array The found attachments, keyed by attachment ID.
       */
      protected function get_gallery_attachments_for_included( $include ) {
          $ids_to_include = wp_parse_id_list( $include );

          // An include list may parse to no IDs at all (for example one holding only
          // separators). Querying with an empty `post__in` would not restrict the
          // result to the gallery, so return early instead.
          if ( empty( $ids_to_include ) ) {
              return [];
          }

          $attachments = $this->get_attachments(
              [
                  'posts_per_page' => count( $ids_to_include ),
                  'post__in'       => $ids_to_include,
              ]
          );

          $gallery_attachments = [];
          foreach ( $attachments as $attachment ) {
              $gallery_attachments[ $attachment->ID ] = $attachment;
          }

          return $gallery_attachments;
      }

      /**
       * Returns the attachments.
       *
       * @param array $args Array with query args; they override the defaults below.
       *
       * @return array The found attachments.
       */
      protected function get_attachments( $args ) {
          $default_args = [
              'post_status'         => 'inherit',
              'post_type'           => 'attachment',
              'post_mime_type'      => 'image',

              // Defaults taken from function get_posts.
              'orderby'             => 'date',
              'order'               => 'DESC',
              'meta_key'            => '',
              'meta_value'          => '',
              'suppress_filters'    => true,
              'ignore_sticky_posts' => true,
              'no_found_rows'       => true,
          ];

          $args = wp_parse_args( $args, $default_args );

          $get_attachments = new WP_Query();

          return $get_attachments->query( $args );
      }
  }