major rewrite wrt parsing, geocoding, and iterating

This commit is contained in:
Harald Milz 2019-03-26 21:37:49 +01:00
parent c93debfd5c
commit 6165de6085
3 changed files with 285 additions and 247 deletions

View file

@ -40,8 +40,8 @@ function gcal_import_worker()
global $wpdb;
$table = $wpdb->prefix.GCAL_TABLE;
$categories = $wpdb->get_results("SELECT gcal_category from $table");
if (empty($categories)) {
$categories = $wpdb->get_results("SELECT gcal_category, gcal_link from $table WHERE gcal_active = '1'");
if ($wpdb->num_rows == 0) {
error_log ("keine Einträge in $wpdb->prefix.GCAL_TABLE gefunden.");
return (0);
}
@ -49,8 +49,16 @@ function gcal_import_worker()
file_put_contents ($file, var_export ($categories, TRUE));
foreach ( $categories as $category) {
error_log ("found category $category");
gcal_import_process_category($category);
error_log ("found category $category->gcal_category");
$table = $wpdb->prefix . 'postmeta';
$post_ids = $wpdb->get_results("SELECT post_id FROM $table WHERE
meta_key = '_gcal_category' AND meta_value = '$category->gcal_category'");
foreach ($post_ids as $post_id) {
error_log ("trashing post_id $post_id->post_id");
wp_trash_post($post_id->post_id);
}
// jetzt die neuen Posts anlegen
gcal_import_do_import($category->gcal_category, $category->gcal_link);
}
error_log ("gcal_import_worker finished", 0);
@ -58,26 +66,277 @@ function gcal_import_worker()
/**
 * Produce the value stored in an event's '_geocity' meta field.
 *
 * The location is handed back untouched: when the complete address is put
 * into the city field it is displayed correctly.
 *
 * @param mixed $location Location of the event.
 * @return mixed The very same value that was passed in.
 */
function gcal_import_geocity($location) {
    $city = $location;
    return $city;
}
function gcal_import_process_category($category) {
/**
 * Produce the value stored in an event's '_geoshow' meta field.
 *
 * Not implemented yet: the location is ignored.
 *
 * @param mixed $location Location of the event (currently unused).
 * @return string Always the empty string.
 */
function gcal_import_geoshow($location) {
    // To be filled in later.
    $geoshow = '';
    return $geoshow;
}
/**
 * Extract the numeric HTTP status code from a list of response header lines.
 *
 * Only the first entry is inspected; it is expected to be a status line of
 * the form "HTTP/1.0 <code> <text>".
 *
 * @param mixed $http_response_header List of header lines. Anything that is
 *                                    not an array with a string at index 0
 *                                    yields 0.
 * @return int The status code, or 0 when none could be determined.
 */
function getHttpCode($http_response_header)
{
    // An empty list or a non-string first entry carries no status line;
    // bail out before indexing so no notice is raised.
    if (!is_array($http_response_header)
        || !isset($http_response_header[0])
        || !is_string($http_response_header[0])) {
        return 0;
    }

    $parts = explode(' ', $http_response_header[0]);
    if (count($parts) > 1) { // HTTP/1.0 <code> <text>
        return intval($parts[1]);
    }
    return 0;
}
/**
 * Look up latitude and longitude for a free-text location.
 *
 * Results are cached in the table $wpdb->prefix . GCAL_GEO_TABLE because the
 * same location is requested many times, especially for recurring events.
 * The cache key is the MD5 hash of the location, which has a fixed length
 * while the location itself has not. Cache rows older than 30 days are
 * purged before a new lookup, so the table does not grow indefinitely and
 * changed Google Maps results replace outdated ones over time.
 *
 * On a cache miss the Google Maps search page for the location is fetched
 * with cURL (up to three attempts when the server answers 429) and the
 * coordinates are scraped from the returned markup.
 *
 * @param string $location Free-text location of an event.
 * @return array Numerically indexed array: [0] latitude, [1] longitude.
 *               Blank strings are returned when nothing could be determined.
 */
function gcal_import_geocode($location) {
    error_log ("entering gcal_import_geocode ($location)");
    global $wpdb;
    $table = $wpdb->prefix.GCAL_GEO_TABLE;

    // An empty location cannot be geocoded; skip both cache and HTTP request.
    if ('' === trim ((string) $location)) {
        return array ('', '');
    }

    $hash = hash ('md5', $location);
    // The table name cannot be a placeholder; the hash is bound by prepare().
    $query = $wpdb->prepare (
        "SELECT gcal_geo_lat, gcal_geo_lon FROM $table WHERE gcal_geo_hash = %s",
        $hash
    );
    error_log ("gcal_import_geocode looking up hash $hash location $location");
    error_log ("query: $query");
    $cached = $wpdb->get_row ($query, ARRAY_N);
    // debug dump of the cache lookup
    $file = dirname (__FILE__) . "/$hash-lookup-result.txt";
    file_put_contents ($file, var_export ($cached, TRUE));
    if (is_array ($cached)) {
        // get_row() yields null on a miss, so an array is a cache hit.
        error_log ("gcal_import_geocode found hash $hash lat $cached[0] lon $cached[1]");
        return ($cached);
    }

    // Do the housekeeping first, before we create a new caching entry.
    $outdated = time() - 30 * 24 * 60 * 60; // 30 days
    $wpdb->query ($wpdb->prepare ("DELETE FROM $table WHERE gcal_geo_timestamp < %d", $outdated));

    // We need to go easy on Google Maps in order not to get a
    // 429 Too Many Requests, hence the limited number of attempts.
    // No custom User-Agent is sent: with one, the returned page was unusable.
    $url = "https://maps.google.com/maps?q=" . urlencode ($location);
    $max_attempts = 3;
    $success = false;
    $result = false;
    for ($attempts = 0; $attempts < $max_attempts && !$success; ++$attempts) {
        // cURL rather than file_get_contents because it handles redirects and cookies.
        $ch = curl_init ($url);
        curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1);
        $result = curl_exec ($ch);
        $http_code = curl_getinfo ($ch, CURLINFO_HTTP_CODE);
        curl_close ($ch);

        if (200 == $http_code) {
            $success = true;
        } elseif (429 == $http_code) {
            error_log ("got a HTTP 429 Too Many Requests on $url");
            sleep (2); // back off before the next attempt
        } else {
            error_log ("Unbekannter HTTP Fehler $http_code");
            return array (' ', ' ');
        }
    }
    if (!$success) {
        // Every attempt was rate limited; there is no page to parse.
        error_log ("gcal_import_geocode giving up on $url after $max_attempts attempts");
        return array (' ', ' ');
    }

    // debug dump of the fetched page
    $file = dirname (__FILE__) . "/$hash-result.html";
    file_put_contents ($file, $result);

    // Coordinates appear as ".../maps/preview/place/<name>/@<lat>,<lon>,...".
    // A leading minus is allowed so southern/western coordinates match too.
    $pattern = '#www\.google\.com/maps/preview/place/[^/]+/@(-?[\d.]+),(-?[\d.]+),#';
    $matches = array ();
    $found = (1 === preg_match ($pattern, $result, $matches));
    // debug dump of the regex result
    $file = dirname (__FILE__) . "/$hash-matches.html";
    file_put_contents ($file, var_export ($matches, TRUE));
    if (!$found) {
        error_log ("gcal_import_geocode found no coordinates for hash $hash");
        return array ('', '');
    }
    error_log ("gcal_import_geocode geocoded lat=$matches[1] lon=$matches[2] for hash $hash");

    // Cache the result; $wpdb->insert() takes care of escaping the values.
    $wpdb->insert ($table, array (
        'gcal_geo_location' => substr ($location, 0, 128),
        'gcal_geo_hash' => $hash,
        'gcal_geo_lat' => $matches[1],
        'gcal_geo_lon' => $matches[2],
        'gcal_geo_timestamp' => time (),
    ));

    return array ($matches[1], $matches[2]);
}
/**
 * Import the upcoming events of one iCal feed as posts of type 'termine'.
 *
 * The feed is parsed with \om\IcalParser. Every event whose end lies in the
 * future is geocoded via gcal_import_geocode() and inserted with
 * wp_insert_post(), together with the meta fields the calendar display
 * reads ('_wpcal_from', '_bis', '_lat', '_lon', ...). The category is stored
 * in the '_gcal_category' meta field.
 *
 * @param string $category Category the imported events belong to.
 * @param string $link     Location of the iCal file handed to the parser.
 * @return void
 */
function gcal_import_do_import($category, $link) {
    error_log ("entering gcal_import_do_import($category, $link)");
    require_once dirname (__FILE__) . '/../icalparser/src/IcalParser.php';
    require_once dirname (__FILE__) . '/../icalparser/src/Recurrence.php';
    require_once dirname (__FILE__) . '/../icalparser/src/Freq.php';
    require_once dirname (__FILE__) . '/../icalparser/src/WindowsTimezones.php';

    $cal = new \om\IcalParser();
    try {
        $results = $cal->parseFile($link);
    } catch (Exception $e) {
        // An unreadable or broken feed must not take the whole run down.
        error_log ("gcal_import_do_import could not parse $link: " . $e->getMessage());
        return;
    }
    // debug dump of the parsed calendar
    $file = dirname (__FILE__) . "/cal-$category-parsed.txt";
    file_put_contents ($file, var_export($results, TRUE));

    // We must set a current user because we may not be logged in.
    // Deliberately no wp_set_auth_cookie(): a background import must not
    // hand a login cookie for this user to whoever triggered the request.
    $user_id = 1;
    $user = get_user_by( 'id', $user_id );
    if( $user ) {
        wp_set_current_user( $user_id, $user->user_login );
    }

    $post_type = 'termine';
    $now = new DateTime();

    foreach ($cal->getSortedEvents() as $r) {
        if (!isset ($r['DTSTART']) || !($r['DTSTART'] instanceof DateTimeInterface)) {
            error_log ("gcal_import_do_import skipping event without start date");
            continue;
        }
        $start = $r['DTSTART'];
        // Events without an end are treated as ending when they start.
        $end = (isset ($r['DTEND']) && $r['DTEND'] instanceof DateTimeInterface) ? $r['DTEND'] : $start;
        $summary = isset ($r['SUMMARY']) ? $r['SUMMARY'] : '';
        $description = isset ($r['DESCRIPTION']) ? $r['DESCRIPTION'] : '';
        $raw_location = isset ($r['LOCATION']) ? $r['LOCATION'] : '';
        $dtstart = $start->format('d.m.Y H:i');

        // Do not post events whose end lies in the past.
        if ($end < $now) {
            error_log ("not posting expired event $summary on $dtstart");
            continue;
        }
        error_log ("processing $summary on $dtstart");

        // The timestamp meta field: the start time string run through
        // strtotime(), stored as a string of digits.
        $zeitstempel = (string) strtotime ($dtstart);

        // geocode the location
        $location = urldecode ($raw_location);
        error_log ("invoking gcal_import_geocode for $location");
        $my_latlon = gcal_import_geocode($location);

        // Build an empty default post by hand; get_default_post_to_edit()
        // is not available here (calling it gave an undefined function error).
        $post = new stdClass;
        $post->ID = 0;
        $post->post_author = '';
        $post->post_date = '';
        $post->post_date_gmt = '';
        $post->post_password = '';
        $post->post_name = '';
        $post->post_type = $post_type;
        $post->post_status = 'draft';
        $post->to_ping = '';
        $post->pinged = '';
        $post->comment_status = get_default_comment_status( $post_type );
        $post->ping_status = get_default_comment_status( $post_type, 'pingback' );
        $post->post_pingback = get_option( 'default_pingback_flag' );
        $post->post_category = get_option( 'default_category' );
        $post->page_template = 'default';
        $post->post_parent = 0;
        $post->menu_order = 0;
        $post = new WP_Post( $post );
        // The default filters start from empty strings.
        $post->post_content = apply_filters( 'default_content', '', $post );
        $post->post_title = apply_filters( 'default_title', '', $post );
        $post->post_excerpt = apply_filters( 'default_excerpt', '', $post );
        // debug dump of the defaults
        $file = dirname (__FILE__) . '/' . 'post-defaults.txt';
        file_put_contents ( $file, var_export ($post, TRUE) );

        // TODO: create an image attachment and associate it with the new post.
        if ( isset($r['ATTACH']) ) {
            $attach = is_scalar ($r['ATTACH']) ? $r['ATTACH'] : var_export ($r['ATTACH'], TRUE);
            error_log ("gcal_import_do_import found attachment $attach for $summary");
        }

        // and fill in the post form
        $post->post_author = '1';
        $post->post_content = $description;
        $post->post_title = $summary;
        // Excerpt for the overview page ([wpcalendar kat=...]): the first
        // 160 characters plus ' ...'. Counted in characters, not bytes, so
        // a multi-byte character is never cut in half.
        if (mb_strlen ($description, 'UTF-8') > 160) {
            $post->post_excerpt = mb_substr ($description, 0, 160, 'UTF-8') . ' ...';
        } else {
            $post->post_excerpt = $description;
        }
        $post->post_status = 'publish';
        $post->post_category = array ($category,);
        $post->visibility = 'public';
        // now the wpcalendar metas.
        $post->meta_input = array(
            '_wpcal_from' => $dtstart,
            '_bis' => $end->format('d.m.Y H:i'),
            '_geocity' => gcal_import_geocity($raw_location),
            '_geoshow' => gcal_import_geoshow($raw_location),
            '_lat' => $my_latlon[0],
            '_lon' => $my_latlon[1],
            '_zoom' => '10',
            '_veranstalter' => '',
            '_veranstalterlnk' => '',
            '_zeitstempel' => $zeitstempel,
            '_gcal_category' => $category,
        );
        // debug dump of the finished post
        $file = dirname (__FILE__) . '/' . $post->post_name . '-finished.txt';
        file_put_contents ( $file, var_export ($post, TRUE) );

        $post_id = wp_insert_post( $post, false );
        if (!$post_id) {
            // With $wp_error = false a failed insert is reported as 0.
            error_log ("gcal_import_do_import could not insert $summary on $dtstart");
            continue;
        }
        error_log ("posted new post $post_id");
    }
}
}

View file

@ -43,7 +43,7 @@ define ('GCAL_GEO_TABLE', 'gcal_import_geocache');
*/
// The real work goes here.
include dirname( __FILE__ ) . "/gcal-import-worker.php";
require_once dirname( __FILE__ ) . "/gcal-import-worker.php";
add_action( 'gcal_import_worker_hook', 'gcal_import_worker' );
@ -93,9 +93,11 @@ function gcal_import_activate()
// empty it first to prevent doublettes
$wpdb->query("DELETE FROM $table WHERE 1=1");
$wpdb->query("INSERT INTO $table(gcal_category, gcal_link, gcal_active)
VALUES('kv-freising', 'https://calendar.google.com/calendar/ical/gruene.freising%40gmail.com/public/basic.ics', '1')");
VALUES('Kreisverband', 'https://calendar.google.com/calendar/ical/gruene.freising%40gmail.com/public/basic.ics', '1')");
/*
$wpdb->query("INSERT INTO $table(gcal_category, gcal_link, gcal_active)
VALUES('ov-freising', '/tmp/neufahrn.ics', '1')");
*/
// CREATE geocaching table if it does not exist already.
// the location field will be used only during development and debugging, and will be omitted in production.
@ -106,7 +108,7 @@ function gcal_import_activate()
gcal_geo_hash VARCHAR(40) NOT NULL,
gcal_geo_lat VARCHAR(20) NOT NULL,
gcal_geo_lon VARCHAR(20) NOT NULL,
gcal_geo_timestamp DATETIME NOT NULL,
gcal_geo_timestamp INT(16) NOT NULL,
UNIQUE KEY id (id)
);";
$wpdb->query($query);
@ -115,7 +117,7 @@ function gcal_import_activate()
// do it once now! Won't work if the table hasn't been populated yet.
$result = $wpdb->query("SELECT gcal_category FROM $table");
if ($result != 0) {
gcal_import_worker;
gcal_import_worker();
}
// and start the scheduler;
// in production, we will do this hourly.

View file

@ -1,223 +0,0 @@
<?php
defined( 'ABSPATH' ) or die( 'No script kiddies please!' );
/**
 * Produce the value stored in an event's '_geocity' meta field.
 *
 * The location is handed back untouched: when the complete address is put
 * into the city field it is displayed correctly.
 *
 * @param mixed $location Location of the event.
 * @return mixed The very same value that was passed in.
 */
function gcal_import_geocity($location) {
    $city = $location;
    return $city;
}
/**
 * Produce the value stored in an event's '_geoshow' meta field.
 *
 * Not implemented yet: the location is ignored.
 *
 * @param mixed $location Location of the event (currently unused).
 * @return string Always the empty string.
 */
function gcal_import_geoshow($location) {
    // To be filled in later.
    $geoshow = '';
    return $geoshow;
}
/**
 * Extract the numeric HTTP status code from a list of response header lines.
 *
 * Only the first entry is inspected; it is expected to be a status line of
 * the form "HTTP/1.0 <code> <text>".
 *
 * @param mixed $http_response_header List of header lines. Anything that is
 *                                    not an array with a string at index 0
 *                                    yields 0.
 * @return int The status code, or 0 when none could be determined.
 */
function getHttpCode($http_response_header)
{
    // An empty list or a non-string first entry carries no status line;
    // bail out before indexing so no notice is raised.
    if (!is_array($http_response_header)
        || !isset($http_response_header[0])
        || !is_string($http_response_header[0])) {
        return 0;
    }

    $parts = explode(' ', $http_response_header[0]);
    if (count($parts) > 1) { // HTTP/1.0 <code> <text>
        return intval($parts[1]);
    }
    return 0;
}
// Name of the geocoding cache table; it is combined with $wpdb->prefix
// wherever it is used. Guarded so that including this file does not raise an
// "already defined" notice when the constant has been defined elsewhere.
if (!defined ('GCAL_GEO_TABLE')) {
    define ('GCAL_GEO_TABLE', 'gcal_import_geocache');
}
/**
 * Look up latitude and longitude for a free-text location.
 *
 * Results are cached in the table $wpdb->prefix . GCAL_GEO_TABLE because the
 * same location is requested many times, especially for recurring events.
 * The cache key is the MD5 hash of the location, which has a fixed length
 * while the location itself has not. Cache rows older than 30 days are
 * purged before a new lookup, so the table does not grow indefinitely and
 * changed Google Maps results replace outdated ones over time.
 *
 * On a cache miss the Google Maps search page for the location is fetched
 * (up to three attempts when the server answers 429) and the coordinates are
 * scraped from the returned markup.
 *
 * @param string $location Free-text location of an event.
 * @return array Numerically indexed array: [0] latitude, [1] longitude.
 *               Empty strings are returned when nothing could be determined.
 */
function gcal_import_geocode($location) {
    global $wpdb;
    $table = $wpdb->prefix.GCAL_GEO_TABLE;

    // An empty location cannot be geocoded; skip both cache and HTTP request.
    if ('' === trim ((string) $location)) {
        return array ("", "");
    }

    $hash = hash ('md5', $location);
    // The table name cannot be a placeholder; the hash is bound (and quoted)
    // by prepare().
    $query = $wpdb->prepare (
        "SELECT gcal_geo_lat, gcal_geo_lon FROM $table WHERE gcal_geo_hash = %s",
        $hash
    );
    error_log ("gcal_import_geocode looking up hash $hash location $location");
    // ARRAY_N yields array(lat, lon), the same shape a fresh lookup returns;
    // get_row() yields null on a miss.
    $cached = $wpdb->get_row ($query, ARRAY_N);
    if (is_array ($cached)) {
        error_log ("gcal_import_geocode found hash $hash lat $cached[0] lon $cached[1]");
        return ($cached);
    }

    // Do the housekeeping first, before we create a new caching entry.
    // The timestamp column is a DATETIME, so compare against a formatted date.
    $outdated = gmdate ('Y-m-d H:i:s', time () - 30 * 24 * 60 * 60); // 30 days
    $wpdb->query ($wpdb->prepare ("DELETE FROM $table WHERE gcal_geo_timestamp < %s", $outdated));

    // let's be a mobile Firefox Klar browser just for fun.
    $opts = array ('http' =>
        array (
            'method' => "GET",
            'header' => "User-Agent: Mozilla/5.0 (Android 7.0; Mobile; rv:62.0) Gecko/62.0 Firefox/62.0",
            // Hand back the body for error statuses as well instead of
            // raising a warning; the status is checked explicitly below.
            'ignore_errors' => true,
        )
    );
    $context = stream_context_create ($opts);

    // We need to go easy on Google Maps in order not to get a
    // 429 Too Many Requests, hence the limited number of attempts.
    $url = 'https://maps.google.com/maps?q=' . urlencode ($location);
    $result = false;
    for ($attempts = 0; $attempts < 3; ++$attempts) {
        $result = file_get_contents ($url, false, $context);
        // $http_response_header only exists once a response was received.
        $code = isset ($http_response_header) ? getHttpCode ($http_response_header) : 0;
        if (429 != $code) {
            break;
        }
        error_log ("got a HTTP 429 Too Many Requests on $url");
        $result = false; // a rate-limit page is not a usable result
        sleep (2);
    }

    // bail gracefully if the fetch did not work for any reason
    if ($result === FALSE) {
        return array ("", "");
    }

    // Coordinates appear as ".../maps/preview/place/<name>/@<lat>,<lon>,...".
    // A leading minus is allowed so southern/western coordinates match too.
    $pattern = '#www\.google\.com/maps/preview/place/[^/]+/@(-?[\d.]+),(-?[\d.]+),#';
    $matches = array ();
    if (1 !== preg_match ($pattern, $result, $matches)) {
        error_log ("gcal_import_geocode found no coordinates for hash $hash");
        return array ("", "");
    }
    error_log ("gcal_import_geocode geocoded lat $matches[1] lon $matches[2] for hash $hash");

    // Cache the result; $wpdb->insert() takes care of escaping the values.
    $wpdb->insert ($table, array (
        'gcal_geo_location' => substr ($location, 0, 128),
        'gcal_geo_hash' => $hash,
        'gcal_geo_lat' => $matches[1],
        'gcal_geo_lon' => $matches[2],
        'gcal_geo_timestamp' => gmdate ('Y-m-d H:i:s'),
    ));

    return array ($matches[1], $matches[2]);
}
/**
 * Import the upcoming events of one iCal feed as posts of type 'termine'.
 *
 * The feed is parsed with \om\IcalParser. Every event whose end lies in the
 * future is geocoded via gcal_import_geocode() and inserted with
 * wp_insert_post(), together with the wpcalendar meta fields
 * ('_wpcal_from', '_bis', '_lat', '_lon', ...). The category is stored in
 * the '_gcal_category' meta field.
 *
 * @param string $category Category the imported events belong to.
 * @param string $link     Location of the iCal file handed to the parser.
 * @return int ID of the last post that was created, or 0 if none was.
 */
function gcal_import_do_import($category, $link) {
    require_once dirname (__FILE__) . '/../../icalparser/src/IcalParser.php';
    require_once dirname (__FILE__) . '/../../icalparser/src/Recurrence.php';
    require_once dirname (__FILE__) . '/../../icalparser/src/Freq.php';
    require_once dirname (__FILE__) . '/../../icalparser/src/WindowsTimezones.php';

    $cal = new \om\IcalParser();
    try {
        $cal->parseFile($link);
    } catch (Exception $e) {
        // An unreadable or broken feed must not take the whole run down.
        error_log ("gcal_import_do_import could not parse $link: " . $e->getMessage());
        return 0;
    }

    // We must set a current user because we may not be logged in.
    // Deliberately no wp_set_auth_cookie(): a background import must not
    // hand a login cookie for this user to whoever triggered the request.
    $user_id = 1;
    $user = get_user_by( 'id', $user_id );
    if( $user ) {
        wp_set_current_user( $user_id, $user->user_login );
    }

    $now = new DateTime('NOW');
    $post_id = 0;

    foreach ($cal->getSortedEvents() as $r) {
        if (!isset ($r['DTSTART']) || !($r['DTSTART'] instanceof DateTimeInterface)) {
            error_log ("gcal_import_do_import skipping event without start date");
            continue;
        }
        $start = $r['DTSTART'];
        // Events without an end are treated as ending when they start.
        $end = (isset ($r['DTEND']) && $r['DTEND'] instanceof DateTimeInterface) ? $r['DTEND'] : $start;
        $summary = isset ($r['SUMMARY']) ? $r['SUMMARY'] : '';
        $description = isset ($r['DESCRIPTION']) ? $r['DESCRIPTION'] : '';
        $location = isset ($r['LOCATION']) ? $r['LOCATION'] : '';

        // Do not post events whose end lies in the past.
        if ($end < $now) {
            continue;
        }

        // The timestamp meta field: the start time string run through
        // strtotime(), stored as a string of digits.
        $wpc_from = $start->format('d.m.Y H:i');
        $zeitstempel = (string) strtotime ($wpc_from);

        // geocode the location
        $my_latlon = gcal_import_geocode($location);

        // TODO: create an image attachment and associate it with the new post.
        if ( ! empty ($r['ATTACH']) ) {
            $attach = is_scalar ($r['ATTACH']) ? $r['ATTACH'] : var_export ($r['ATTACH'], TRUE);
            error_log ("gcal_import_do_import found attachment $attach for $summary");
        }

        // Excerpt for the overview page ([wpcalendar kat=...]): the first
        // 160 characters plus ' ...'. Counted in characters, not bytes, so
        // a multi-byte character is never cut in half.
        if (mb_strlen ($description, 'UTF-8') > 160) {
            $excerpt = mb_substr ($description, 0, 160, 'UTF-8') . ' ...';
        } else {
            $excerpt = $description;
        }

        // The post is built as a plain array: get_default_post_to_edit() is
        // an admin-only function and returns an object, not an array.
        $post = array(
            'post_type' => 'termine',
            'post_content' => $description,
            'post_title' => $summary,
            'post_excerpt' => $excerpt,
            'post_status' => 'publish',
            'post_category' => array ($category),
            // Sanitized title with the start time prepended, so that
            // recurring events do not all end up with the same post_name.
            'post_name' => $start->format('Y-m-d-H-i') . '-' . strtolower( urlencode($summary) ),
            'visibility' => 'public',
            // now the wpcalendar metas.
            'meta_input' => array(
                '_wpcal_from' => $wpc_from,
                '_bis' => $end->format('d.m.Y H:i'),
                '_geocity' => gcal_import_geocity($location),
                '_geoshow' => gcal_import_geoshow($location),
                '_lat' => $my_latlon[0],
                '_lon' => $my_latlon[1],
                '_zoom' => 10,
                '_veranstalter' => '',
                '_veranstalterlnk' => '',
                '_zeitstempel' => $zeitstempel,
                '_gcal_category' => $category,
            ),
        );

        // debug dump of the finished post
        $file = dirname (__FILE__) . '/' . $post['post_name'] . '-finished.txt';
        file_put_contents ( $file, var_export ($post, TRUE) );

        $inserted = wp_insert_post( $post, false );
        if (!$inserted) {
            // With $wp_error = false a failed insert is reported as 0.
            error_log ("gcal_import_do_import could not insert $summary on $wpc_from");
            continue;
        }
        // Keep going: returning here would import only the first event.
        $post_id = $inserted;
    }

    return ($post_id);
}