diff --git a/composer.lock b/composer.lock index ddee80e37..b42d6b30f 100644 --- a/composer.lock +++ b/composer.lock @@ -16334,16 +16334,16 @@ }, { "name": "unocha/ocha_ai", - "version": "v1.4.0", + "version": "v1.4.4", "source": { "type": "git", "url": "https://github.com/UN-OCHA/ocha_ai.git", - "reference": "688d00de9fef3040bb6da956b9d6b25cbe090d47" + "reference": "c60b5cd940fd14b696990c55282ac5095a0950e4" }, "dist": { "type": "zip", - "url": "https://api.github.com/repos/UN-OCHA/ocha_ai/zipball/688d00de9fef3040bb6da956b9d6b25cbe090d47", - "reference": "688d00de9fef3040bb6da956b9d6b25cbe090d47", + "url": "https://api.github.com/repos/UN-OCHA/ocha_ai/zipball/c60b5cd940fd14b696990c55282ac5095a0950e4", + "reference": "c60b5cd940fd14b696990c55282ac5095a0950e4", "shasum": "" }, "require": { @@ -16375,9 +16375,9 @@ "description": "OCHA AI module", "support": { "issues": "https://github.com/UN-OCHA/ocha_ai/issues", - "source": "https://github.com/UN-OCHA/ocha_ai/tree/v1.4.0" + "source": "https://github.com/UN-OCHA/ocha_ai/tree/v1.4.4" }, - "time": "2024-07-18T10:13:41+00:00" + "time": "2024-07-24T12:29:15+00:00" }, { "name": "unocha/ocha_monitoring", @@ -17314,5 +17314,5 @@ "php": ">=8.2" }, "platform-dev": [], - "plugin-api-version": "2.6.0" + "plugin-api-version": "2.3.0" } diff --git a/config/core.entity_form_display.node.job.default.yml b/config/core.entity_form_display.node.job.default.yml index 41d1c5c66..28533403e 100644 --- a/config/core.entity_form_display.node.job.default.yml +++ b/config/core.entity_form_display.node.job.default.yml @@ -12,6 +12,7 @@ dependencies: - field.field.node.job.field_import_hash - field.field.node.job.field_job_closing_date - field.field.node.job.field_job_experience + - field.field.node.job.field_job_tagger_queue_count - field.field.node.job.field_job_type - field.field.node.job.field_source - field.field.node.job.field_theme @@ -189,6 +190,7 @@ hidden: created: true field_import_guid: true field_import_hash: true + 
field_job_tagger_queue_count: true langcode: true promote: true reliefweb_job_tagger_info: true diff --git a/config/core.entity_form_display.taxonomy_term.theme.default.yml b/config/core.entity_form_display.taxonomy_term.theme.default.yml index 3878288d5..53de753b0 100644 --- a/config/core.entity_form_display.taxonomy_term.theme.default.yml +++ b/config/core.entity_form_display.taxonomy_term.theme.default.yml @@ -3,6 +3,7 @@ langcode: en status: true dependencies: config: + - field.field.taxonomy_term.theme.field_example_job_posting - taxonomy.vocabulary.theme module: - allowed_formats @@ -23,6 +24,14 @@ content: allowed_formats: hide_help: '1' hide_guidelines: '1' + field_example_job_posting: + type: string_textarea + weight: 26 + region: content + settings: + rows: 5 + placeholder: '' + third_party_settings: { } name: type: string_textfield weight: 0 diff --git a/config/core.entity_view_display.node.job.default.yml b/config/core.entity_view_display.node.job.default.yml index 0dc4ef174..4735853c3 100644 --- a/config/core.entity_view_display.node.job.default.yml +++ b/config/core.entity_view_display.node.job.default.yml @@ -12,6 +12,7 @@ dependencies: - field.field.node.job.field_import_hash - field.field.node.job.field_job_closing_date - field.field.node.job.field_job_experience + - field.field.node.job.field_job_tagger_queue_count - field.field.node.job.field_job_type - field.field.node.job.field_source - field.field.node.job.field_theme @@ -114,6 +115,7 @@ content: hidden: field_import_guid: true field_import_hash: true + field_job_tagger_queue_count: true langcode: true links: true reliefweb_job_tagger_info: true diff --git a/config/core.entity_view_display.node.job.teaser.yml b/config/core.entity_view_display.node.job.teaser.yml index bb4714591..9d88c6d86 100644 --- a/config/core.entity_view_display.node.job.teaser.yml +++ b/config/core.entity_view_display.node.job.teaser.yml @@ -13,6 +13,7 @@ dependencies: - field.field.node.job.field_import_hash - 
field.field.node.job.field_job_closing_date - field.field.node.job.field_job_experience + - field.field.node.job.field_job_tagger_queue_count - field.field.node.job.field_job_type - field.field.node.job.field_source - field.field.node.job.field_theme @@ -58,6 +59,7 @@ hidden: field_import_guid: true field_import_hash: true field_job_experience: true + field_job_tagger_queue_count: true field_job_type: true field_theme: true langcode: true diff --git a/config/core.entity_view_display.taxonomy_term.theme.default.yml b/config/core.entity_view_display.taxonomy_term.theme.default.yml new file mode 100644 index 000000000..8cf8a9519 --- /dev/null +++ b/config/core.entity_view_display.taxonomy_term.theme.default.yml @@ -0,0 +1,24 @@ +uuid: 987d381c-774a-4e6a-ae66-f8252c87861e +langcode: en +status: true +dependencies: + config: + - field.field.taxonomy_term.theme.field_example_job_posting + - taxonomy.vocabulary.theme + module: + - text +id: taxonomy_term.theme.default +targetEntityType: taxonomy_term +bundle: theme +mode: default +content: + description: + type: text_default + label: hidden + settings: { } + third_party_settings: { } + weight: 0 + region: content +hidden: + field_example_job_posting: true + langcode: true diff --git a/config/field.field.node.job.field_job_tagger_queue_count.yml b/config/field.field.node.job.field_job_tagger_queue_count.yml new file mode 100644 index 000000000..987da0ae1 --- /dev/null +++ b/config/field.field.node.job.field_job_tagger_queue_count.yml @@ -0,0 +1,25 @@ +uuid: ac87ff1a-b49d-47f9-8b81-6a3d30212913 +langcode: en +status: true +dependencies: + config: + - field.storage.node.field_job_tagger_queue_count + - node.type.job +id: node.job.field_job_tagger_queue_count +field_name: field_job_tagger_queue_count +entity_type: node +bundle: job +label: 'Job tagger queue count' +description: '' +required: false +translatable: false +default_value: + - + value: 0 +default_value_callback: '' +settings: + min: 0 + max: null + prefix: '' + 
suffix: '' +field_type: integer diff --git a/config/field.field.taxonomy_term.theme.field_example_job_posting.yml b/config/field.field.taxonomy_term.theme.field_example_job_posting.yml new file mode 100644 index 000000000..0fb7b8ea1 --- /dev/null +++ b/config/field.field.taxonomy_term.theme.field_example_job_posting.yml @@ -0,0 +1,19 @@ +uuid: 95dd0b94-6bd9-48fa-94b4-58b69fb00763 +langcode: en +status: true +dependencies: + config: + - field.storage.taxonomy_term.field_example_job_posting + - taxonomy.vocabulary.theme +id: taxonomy_term.theme.field_example_job_posting +field_name: field_example_job_posting +entity_type: taxonomy_term +bundle: theme +label: 'Example job posting' +description: '' +required: false +translatable: false +default_value: { } +default_value_callback: '' +settings: { } +field_type: string_long diff --git a/config/field.storage.node.field_job_tagger_queue_count.yml b/config/field.storage.node.field_job_tagger_queue_count.yml new file mode 100644 index 000000000..c8445f719 --- /dev/null +++ b/config/field.storage.node.field_job_tagger_queue_count.yml @@ -0,0 +1,20 @@ +uuid: a8443d1d-5ac8-4d42-b340-a7dc9726ef51 +langcode: en +status: true +dependencies: + module: + - node +id: node.field_job_tagger_queue_count +field_name: field_job_tagger_queue_count +entity_type: node +type: integer +settings: + unsigned: false + size: normal +module: core +locked: false +cardinality: 1 +translatable: true +indexes: { } +persist_with_no_fields: false +custom_storage: false diff --git a/config/field.storage.node.reliefweb_job_tagger_status.yml b/config/field.storage.node.reliefweb_job_tagger_status.yml index c0af98840..b9a931117 100644 --- a/config/field.storage.node.reliefweb_job_tagger_status.yml +++ b/config/field.storage.node.reliefweb_job_tagger_status.yml @@ -22,6 +22,9 @@ settings: - value: processed label: processed + - + value: skipped + label: skipped allowed_values_function: '' module: options locked: true diff --git 
a/config/ocha_ai_tag.settings.yml b/config/ocha_ai_tag.settings.yml index 2d2a4f50c..a746a18fc 100644 --- a/config/ocha_ai_tag.settings.yml +++ b/config/ocha_ai_tag.settings.yml @@ -14,7 +14,7 @@ defaults: text_splitter: plugin_id: token vector_store: - plugin_id: elasticsearch + plugin_id: elasticsearch_job completion: plugin_id: aws_bedrock source: diff --git a/config/user.role.beta_tester.yml b/config/user.role.beta_tester.yml index 573684923..1eeb30cda 100644 --- a/config/user.role.beta_tester.yml +++ b/config/user.role.beta_tester.yml @@ -4,9 +4,11 @@ status: true dependencies: module: - ocha_ai_chat + - reliefweb_job_tagger id: beta_tester label: 'Beta tester' weight: 7 is_admin: null permissions: - 'access ocha ai chat' + - 'test ocha ai job tag' diff --git a/config/views.view.stalled_jobs.yml b/config/views.view.stalled_jobs.yml new file mode 100644 index 000000000..ebac8722f --- /dev/null +++ b/config/views.view.stalled_jobs.yml @@ -0,0 +1,424 @@ +uuid: 0de05c1e-1423-47cd-ba0f-2e447f3117b8 +langcode: en +status: true +dependencies: + config: + - node.type.job + module: + - node + - options + - user +id: stalled_jobs +label: 'Stalled jobs' +module: views +description: '' +tag: '' +base_table: node_field_data +base_field: nid +display: + default: + id: default + display_title: Default + display_plugin: default + position: 0 + display_options: + title: 'Stalled jobs' + fields: + title: + id: title + table: node_field_data + field: title + relationship: none + group_type: group + admin_label: '' + entity_type: node + entity_field: title + plugin_id: field + label: Title + exclude: false + alter: + alter_text: false + make_link: false + absolute: false + word_boundary: false + ellipsis: false + strip_tags: false + trim: false + html: false + element_type: '' + element_class: '' + element_label_type: '' + element_label_class: '' + element_label_colon: true + element_wrapper_type: '' + element_wrapper_class: '' + element_default_classes: true + empty: '' + 
hide_empty: false + empty_zero: false + hide_alter_empty: true + click_sort_column: value + type: string + settings: + link_to_entity: true + group_column: value + group_columns: { } + group_rows: true + delta_limit: 0 + delta_offset: 0 + delta_reversed: false + delta_first_last: false + multi_type: separator + separator: ', ' + field_api_classes: false + created: + id: created + table: node_field_data + field: created + relationship: none + group_type: group + admin_label: '' + entity_type: node + entity_field: created + plugin_id: field + label: 'Authored on' + exclude: false + alter: + alter_text: false + text: '' + make_link: false + path: '' + absolute: false + external: false + replace_spaces: false + path_case: none + trim_whitespace: false + alt: '' + rel: '' + link_class: '' + prefix: '' + suffix: '' + target: '' + nl2br: false + max_length: 0 + word_boundary: true + ellipsis: true + more_link: false + more_link_text: '' + more_link_path: '' + strip_tags: false + trim: false + preserve_tags: '' + html: false + element_type: '' + element_class: '' + element_label_type: '' + element_label_class: '' + element_label_colon: true + element_wrapper_type: '' + element_wrapper_class: '' + element_default_classes: true + empty: '' + hide_empty: false + empty_zero: false + hide_alter_empty: true + click_sort_column: value + type: timestamp + settings: + date_format: medium + custom_date_format: '' + timezone: '' + tooltip: + date_format: long + custom_date_format: '' + time_diff: + enabled: false + future_format: '@interval hence' + past_format: '@interval ago' + granularity: 2 + refresh: 60 + description: '' + group_column: value + group_columns: { } + group_rows: true + delta_limit: 0 + delta_offset: 0 + delta_reversed: false + delta_first_last: false + multi_type: separator + separator: ', ' + field_api_classes: false + status: + id: status + table: node_field_data + field: status + relationship: none + group_type: group + admin_label: '' + entity_type: node + 
entity_field: status + plugin_id: field + label: Published + exclude: false + alter: + alter_text: false + text: '' + make_link: false + path: '' + absolute: false + external: false + replace_spaces: false + path_case: none + trim_whitespace: false + alt: '' + rel: '' + link_class: '' + prefix: '' + suffix: '' + target: '' + nl2br: false + max_length: 0 + word_boundary: true + ellipsis: true + more_link: false + more_link_text: '' + more_link_path: '' + strip_tags: false + trim: false + preserve_tags: '' + html: false + element_type: '' + element_class: '' + element_label_type: '' + element_label_class: '' + element_label_colon: true + element_wrapper_type: '' + element_wrapper_class: '' + element_default_classes: true + empty: '' + hide_empty: false + empty_zero: false + hide_alter_empty: true + click_sort_column: value + type: boolean + settings: + format: default + format_custom_false: '' + format_custom_true: '' + group_column: value + group_columns: { } + group_rows: true + delta_limit: 0 + delta_offset: 0 + delta_reversed: false + delta_first_last: false + multi_type: separator + separator: ', ' + field_api_classes: false + operations: + id: operations + table: node + field: operations + relationship: none + group_type: group + admin_label: '' + entity_type: node + plugin_id: entity_operations + label: Operations + exclude: false + alter: + alter_text: false + text: '' + make_link: false + path: '' + absolute: false + external: false + replace_spaces: false + path_case: none + trim_whitespace: false + alt: '' + rel: '' + link_class: '' + prefix: '' + suffix: '' + target: '' + nl2br: false + max_length: 0 + word_boundary: true + ellipsis: true + more_link: false + more_link_text: '' + more_link_path: '' + strip_tags: false + trim: false + preserve_tags: '' + html: false + element_type: '' + element_class: '' + element_label_type: '' + element_label_class: '' + element_label_colon: true + element_wrapper_type: '' + element_wrapper_class: '' + 
element_default_classes: true + empty: '' + hide_empty: false + empty_zero: false + hide_alter_empty: true + destination: false + pager: + type: mini + options: + offset: 0 + items_per_page: 10 + total_pages: null + id: 0 + tags: + next: ›› + previous: ‹‹ + expose: + items_per_page: false + items_per_page_label: 'Items per page' + items_per_page_options: '5, 10, 25, 50' + items_per_page_options_all: false + items_per_page_options_all_label: '- All -' + offset: false + offset_label: Offset + exposed_form: + type: basic + options: + submit_button: Apply + reset_button: false + reset_button_label: Reset + exposed_sorts_label: 'Sort by' + expose_sort_order: true + sort_asc_label: Asc + sort_desc_label: Desc + access: + type: perm + options: + perm: 'edit any job content' + cache: + type: tag + options: { } + empty: { } + sorts: + created: + id: created + table: node_field_data + field: created + relationship: none + group_type: group + admin_label: '' + entity_type: node + entity_field: created + plugin_id: date + order: DESC + expose: + label: '' + field_identifier: '' + exposed: false + granularity: second + arguments: { } + filters: + type: + id: type + table: node_field_data + field: type + entity_type: node + entity_field: type + plugin_id: bundle + value: + job: job + reliefweb_job_tagger_status_value: + id: reliefweb_job_tagger_status_value + table: node__reliefweb_job_tagger_status + field: reliefweb_job_tagger_status_value + relationship: none + group_type: group + admin_label: '' + plugin_id: list_field + operator: or + value: + skipped: skipped + group: 1 + exposed: false + expose: + operator_id: '' + label: '' + description: '' + use_operator: false + operator: '' + operator_limit_selection: false + operator_list: { } + identifier: '' + required: false + remember: false + multiple: false + remember_roles: + authenticated: authenticated + reduce: false + is_grouped: false + group_info: + label: '' + description: '' + identifier: '' + optional: true + widget: 
select + multiple: false + remember: false + default_group: All + default_group_multiple: { } + group_items: { } + reduce_duplicates: false + style: + type: table + row: + type: fields + query: + type: views_query + options: + query_comment: '' + disable_sql_rewrite: false + distinct: false + replica: false + query_tags: { } + relationships: { } + header: { } + footer: { } + display_extenders: { } + cache_metadata: + max-age: -1 + contexts: + - 'languages:language_content' + - 'languages:language_interface' + - url.query_args + - 'user.node_grants:view' + - user.permissions + tags: { } + page_1: + id: page_1 + display_title: Page + display_plugin: page + position: 1 + display_options: + display_extenders: { } + path: admin/content/stalled-jobs + menu: + type: tab + title: 'Stalled jobs' + description: '' + weight: 50 + expanded: false + menu_name: admin + parent: system.admin_content + context: '0' + cache_metadata: + max-age: -1 + contexts: + - 'languages:language_content' + - 'languages:language_interface' + - url.query_args + - 'user.node_grants:view' + - user.permissions + tags: { } diff --git a/html/modules/custom/reliefweb_job_tagger/README.md b/html/modules/custom/reliefweb_job_tagger/README.md new file mode 100644 index 000000000..de70d3e99 --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/README.md @@ -0,0 +1,28 @@ +# Reliefweb job tagger + +## Forms + +- `/admin/ai/test-job-tagger-category` +- `/admin/ai/test-job-tagger-theme` + +## Test instructions + +### Clear index + +```bash +drush rw-job:clear +``` + +### Index jobs + +```bash +drush rw-job:index +``` + +### Evaluate + +```bash +drush eval "reliefweb_job_tagger_test_accuracy()" +``` + +Will create a csv file `stats.csv` containing the analysis of all jobs. 
diff --git a/html/modules/custom/reliefweb_job_tagger/config/install/field.storage.node.reliefweb_job_tagger_status.yml b/html/modules/custom/reliefweb_job_tagger/config/install/field.storage.node.reliefweb_job_tagger_status.yml index f18e3c5a1..c36d2ea5e 100644 --- a/html/modules/custom/reliefweb_job_tagger/config/install/field.storage.node.reliefweb_job_tagger_status.yml +++ b/html/modules/custom/reliefweb_job_tagger/config/install/field.storage.node.reliefweb_job_tagger_status.yml @@ -19,6 +19,9 @@ settings: - value: processed label: processed + - + value: skipped + label: skipped allowed_values_function: '' module: options locked: true diff --git a/html/modules/custom/reliefweb_job_tagger/drush.services.yml b/html/modules/custom/reliefweb_job_tagger/drush.services.yml new file mode 100644 index 000000000..b1d2df5da --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/drush.services.yml @@ -0,0 +1,6 @@ +services: + reliefweb_job_tagger.commands: + class: \Drupal\reliefweb_job_tagger\Commands\ReliefJobTaggerCommands + arguments: ['@entity_type.manager', '@ocha_ai_tag.tagger'] + tags: + - { name: drush.command } diff --git a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.links.menu.yml b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.links.menu.yml new file mode 100644 index 000000000..449aaad21 --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.links.menu.yml @@ -0,0 +1,14 @@ +test_ocha_ai_job_tag: + title: Test job tagger + route_name: test_ocha_ai_job_tag + parent: system.admin +test_ocha_ai_job_tag_category: + title: Test job tagger - Category + description: 'Test category tagging for jobs' + route_name: test_ocha_ai_job_tag_category + parent: test_ocha_ai_job_tag +test_ocha_ai_job_tag_theme: + title: Test job tagger - Theme + description: 'Test theme tagging for jobs' + route_name: test_ocha_ai_job_tag_theme + parent: test_ocha_ai_job_tag diff --git 
a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.links.task.yml b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.links.task.yml new file mode 100644 index 000000000..9132e7d11 --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.links.task.yml @@ -0,0 +1,5 @@ +reliefweb_job_tagger.requeue: + route_name: reliefweb_job_tagger.requeue + base_route: entity.node.canonical + title: Re-queue + weight: 50 diff --git a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.module b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.module index a62229f3e..58eb6bca1 100644 --- a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.module +++ b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.module @@ -5,6 +5,7 @@ * OCHA AI Job tagging. */ +use Drupal\Core\Cache\RefinableCacheableDependencyInterface; use Drupal\Core\Entity\EntityInterface; use Drupal\Core\Form\FormStateInterface; use Drupal\node\Entity\Node; @@ -222,9 +223,35 @@ function reliefweb_job_tagger_node_presave(EntityInterface $node) { if ($node->hasField('reliefweb_job_tagger_status')) { reliefweb_job_tagger_log_manual_changes_to_tagging($node); + // Already queued, nothing left to do. if ($node->reliefweb_job_tagger_status->value == 'queued' || $node->reliefweb_job_tagger_status->value == 'processed') { return; } + + // Something went wrong, retry. + if ($node->reliefweb_job_tagger_status->value == 'skipped') { + $queue_count = 1; + if ($node->hasField('field_job_tagger_queue_count')) { + $queue_count = $node->get('field_job_tagger_queue_count')->value ?? 1; + } + if ($queue_count >= 3) { + $node->set('reliefweb_job_tagger_status', 'processed'); + + $log_message = $node->getRevisionLogMessage(); + $log_message .= (empty($log_message) ? '' : ' ') . 
'Job has been queued 3 times, maximum reached.'; + $node->setRevisionLogMessage($log_message); + } + else { + $node->set('reliefweb_job_tagger_status', 'queued'); + + $log_message = $node->getRevisionLogMessage(); + $log_message .= (empty($log_message) ? '' : ' ') . 'Job has been re-queued for tagging.'; + $node->setRevisionLogMessage($log_message); + } + $node->set('field_job_tagger_queue_count', ++$queue_count); + + return; + } } if ($node->moderation_status->value != 'pending') { @@ -261,6 +288,40 @@ function reliefweb_job_tagger_node_presave(EntityInterface $node) { } } +/** + * Implements hook_menu_local_tasks_alter(). + */ +function reliefweb_job_tagger_menu_local_tasks_alter(&$data, $route_name, RefinableCacheableDependencyInterface &$cacheability) { + if (!isset($data['tabs'][0]['reliefweb_job_tagger.requeue'])) { + return; + } + + if ($route_name !== 'entity.node.canonical') { + $data['tabs'][0]['reliefweb_job_tagger.requeue']['#access'] = FALSE; + return; + } + + /** @var \Drupal\node\Entity\Node $node */ + $node = \Drupal::routeMatch()->getParameter('node'); + + if ($node->bundle() != 'job') { + $data['tabs'][0]['reliefweb_job_tagger.requeue']['#access'] = FALSE; + return; + } + + if ($node->reliefweb_job_tagger_status->value != 'skipped') { + $data['tabs'][0]['reliefweb_job_tagger.requeue']['#access'] = FALSE; + return; + } + + // Check permissions. + $user = \Drupal::currentUser(); + if (!$user->hasPermission('ocha ai job tag requeue job')) { + $data['tabs'][0]['reliefweb_job_tagger.requeue']['#access'] = FALSE; + return; + } +} + /** * Check for manual changes to the AI tagging and log them. 
* @@ -318,7 +379,6 @@ function reliefweb_job_tagger_entity_after_save(EntityInterface $entity) { } $entity_type_id = $entity->getEntityTypeId(); - if ($entity_type_id !== 'node') { return; } @@ -326,6 +386,12 @@ function reliefweb_job_tagger_entity_after_save(EntityInterface $entity) { /** @var \Drupal\node\Entity\Node $entity */ if ($entity->hasField('reliefweb_job_tagger_status') && $entity->get('reliefweb_job_tagger_status')->value == 'queued') { reliefweb_job_tagger_queue_job($entity); + return; + } + + if ($entity->bundle() == 'job' && $entity->isPublished()) { + // Embed vector. + reliefweb_job_tagger_index_embedding($entity->id()); } } @@ -340,3 +406,12 @@ function reliefweb_job_tagger_queue_job(Node $job) : void { $item->nid = $job->id(); $queue->createItem($item); } + +/** + * Index embedding for a job. + */ +function reliefweb_job_tagger_index_embedding($nid) { + /** @var \Drupal\ocha_ai_tag\Services\OchaAiTagTagger $jobTagger */ + $jobTagger = \Drupal::service('ocha_ai_tag.tagger'); + $jobTagger->embedDocument($nid); +} diff --git a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.permissions.yml b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.permissions.yml index 00fcf48ce..38414fd16 100644 --- a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.permissions.yml +++ b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.permissions.yml @@ -9,3 +9,7 @@ bypass ocha ai job tag: test ocha ai job tag: title: 'Test OCHA AI Job tag' description: 'Allow users to use a form to test job tagging.' + +ocha ai job tag requeue job: + title: 'Re-queue job for AI tagging' + description: 'Allow users to re-queue a job for AI tagging.' 
diff --git a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.routing.yml b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.routing.yml index bb337e1a7..3a9aa96d8 100644 --- a/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.routing.yml +++ b/html/modules/custom/reliefweb_job_tagger/reliefweb_job_tagger.routing.yml @@ -1,7 +1,29 @@ test_ocha_ai_job_tag: - path: '/admin/test-job-tagger' + path: '/admin/ai' defaults: - _form: '\Drupal\reliefweb_job_tagger\Form\RwJobTagger' + _controller: '\Drupal\system\Controller\SystemController::systemAdminMenuBlockPage' _title: 'Test job tagger' requirements: _permission: 'test ocha ai job tag' +test_ocha_ai_job_tag_category: + path: '/admin/ai/test-job-tagger-category' + defaults: + _form: '\Drupal\reliefweb_job_tagger\Form\RwJobTaggerCategory' + _title: 'Test job tagger - Category' + requirements: + _permission: 'test ocha ai job tag' +test_ocha_ai_job_tag_theme: + path: '/admin/ai/test-job-tagger-theme' + defaults: + _form: '\Drupal\reliefweb_job_tagger\Form\RwJobTaggerTheme' + _title: 'Test job tagger - Theme' + requirements: + _permission: 'test ocha ai job tag' +reliefweb_job_tagger.requeue: + path: '/node/{node}/re-queue' + defaults: + _form: '\Drupal\reliefweb_job_tagger\Form\RwJobTaggerRequeueForm' + _title: 'Requeue job' + requirements: + _permission: 'ocha ai job tag requeue job' + node: \d+ diff --git a/html/modules/custom/reliefweb_job_tagger/src/Commands/ReliefJobTaggerCommands.php b/html/modules/custom/reliefweb_job_tagger/src/Commands/ReliefJobTaggerCommands.php new file mode 100644 index 000000000..52b290656 --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/src/Commands/ReliefJobTaggerCommands.php @@ -0,0 +1,88 @@ +ochaTagger->clearIndex(); + } + + /** + * Index jobs. + * + * @command reliefweb-jobtagger:index-jobs + * + * @usage reliefweb-jobtagger:index-jobs + * Create vector index for jobs. 
+ * + * @validate-module-enabled reliefweb_job_tagger + * + * @aliases rw-job:index + */ + public function indexJobs(array $options = ['id' => 0]) { + // Index single job. + if (isset($options['id']) && !empty($options['id'])) { + $id = $options['id']; + $this->output->writeln('Processing ' . $id); + $this->ochaTagger->embedDocument($id); + return; + } + + // Only index jobs approved by reliefweb.int editors. + $uids = $this->entityTypeManager->getStorage('user') + ->getQuery() + ->accessCheck(FALSE) + ->condition('status', 1) + ->condition('roles', 'editor') + ->execute(); + + $query = $this->entityTypeManager->getStorage('node') + ->getQuery() + ->accessCheck(FALSE) + ->condition('type', 'job', '=') + ->condition('moderation_status', ['published', 'expired'], 'IN') + ->sort('nid', 'desc'); + + if (!empty($uids)) { + // Limit to documents that have been reviewed by an editor to augment + // the likeliness that the documents were tagged properly. + $query->condition('revision_uid', $uids, 'IN'); + } + + $job_ids = $query->execute() ?? []; + + foreach ($job_ids as $id) { + $this->output->writeln('Processing ' . 
$id); + $this->ochaTagger->embedDocument($id); + } + } + +} diff --git a/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTagger.php b/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerCategory.php similarity index 84% rename from html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTagger.php rename to html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerCategory.php index d4659e4d2..3e3dc16a5 100644 --- a/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTagger.php +++ b/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerCategory.php @@ -5,6 +5,7 @@ use Drupal\Core\Entity\EntityTypeManagerInterface; use Drupal\Core\Form\FormBase; use Drupal\Core\Form\FormStateInterface; +use Drupal\node\NodeInterface; use Drupal\ocha_ai_tag\Services\OchaAiTagTagger; use GuzzleHttp\ClientInterface; use Symfony\Component\DependencyInjection\ContainerInterface; @@ -12,7 +13,7 @@ /** * Chat form for the Ocha AI Chat module. */ -class RwJobTagger extends FormBase { +class RwJobTaggerCategory extends FormBase { /** * {@inheritdoc} @@ -64,6 +65,8 @@ public function buildForm(array $form, FormStateInterface $form_state, ?bool $po $this->t('Career category'), $this->t('Feedback (AI)'), $this->t('Feedback (ES)'), + $this->t('Feedback (Vector)'), + $this->t('Product'), $this->t('Info'), ], ]; @@ -89,6 +92,18 @@ public function buildForm(array $form, FormStateInterface $form_state, ?bool $po '#format' => 'markdown', ]; + $form['feedback'][$url]['vector_feedback'] = [ + '#type' => 'processed_text', + '#text' => $data['vector_feedback'], + '#format' => 'markdown', + ]; + + $form['feedback'][$url]['product'] = [ + '#type' => 'processed_text', + '#text' => $data['product'], + '#format' => 'markdown', + ]; + $form['feedback'][$url]['info'] = [ '#type' => 'processed_text', '#text' => $data['info'], @@ -150,6 +165,12 @@ public function buildForm(array $form, FormStateInterface $form_state, ?bool $po ]; } + $form['reset_cache'] = [ + '#type' => 'checkbox', + 
'#title' => $this->t('Reset cache'), + '#default_value' => FALSE, + ]; + $form['submit'] = [ '#type' => 'submit', '#value' => $this->t('Analyze jobs'), @@ -168,7 +189,10 @@ public function submitForm(array &$form, FormStateInterface $form_state): void { $definitions = $form_state->getValue('definitions', []); $form_state->set('definitions', $definitions); - $this->setTermMapping($definitions); + + if ($form_state->getValue('reset_cache', FALSE)) { + $this->setTermMapping($definitions); + } $results = []; $urls = $form_state->getValue('urls', ''); @@ -208,7 +232,11 @@ public function submitForm(array &$form, FormStateInterface $form_state): void { } // Get ES feedback. - $es = $this->getMostRelevantTermsFromEs('jobs', $node->id(), $api_fields, 50); + $es = $this->getMostRelevantTermsFromEs('jobs', [ + 'id' => $node->id(), + 'title' => $node->getTitle(), + 'body' => $node->body->value, + ], $api_fields, 50); $es = $es['career_category'] ?? []; $es_first = ''; @@ -227,6 +255,11 @@ public function submitForm(array &$form, FormStateInterface $form_state): void { } } + // Do vector search on ES. + $similar_feedback = ''; + $similar = $this->getSimilarJobs($node); + $similar_feedback = $this->setAiFeedback($similar); + // Get AI feedback. $text = $node->getTitle() . "\n\n" . $node->get('body')->value; $ai = $this->processDoc($text, $definitions); @@ -236,15 +269,16 @@ public function submitForm(array &$form, FormStateInterface $form_state): void { $info[] = '- AI and ES agree'; } + $mult = []; $intersect = array_intersect_key($es, $ai); if (!empty($intersect)) { - // Multiple confidence levels. - $mult = []; + // Multiple confidence levels, if not defined fall back to 20%. foreach (array_keys($ai) as $key) { - if (array_key_exists($key, $es)) { - $mult[$key] = $ai[$key] * $es[$key] * 100; - } + $mult[$key] = $ai[$key] * ($es[$key] ?? .2); + $mult[$key] *= ($similar[$key] ?? .2); } + + // Sort reversed and select first. 
arsort($mult); $info[] = '- First in common: ' . array_key_first($mult); } @@ -253,6 +287,8 @@ public function submitForm(array &$form, FormStateInterface $form_state): void { 'category' => $category, 'feedback' => $this->setAiFeedback($ai, 10), 'es_feedback' => $es_feedback, + 'vector_feedback' => $similar_feedback, + 'product' => $this->setAiFeedback($mult), 'info' => implode("\n", $info), ]; } @@ -265,7 +301,7 @@ public function submitForm(array &$form, FormStateInterface $form_state): void { * {@inheritdoc} */ public function getFormId(): string { - return 'rw_job_tagger'; + return 'rw_job_tagger_category'; } /** @@ -517,4 +553,41 @@ public function getMostRelevantTermsFromEs( return $vocabularies; } + /** + * Get similar jobs. + */ + protected function getSimilarJobs(NodeInterface $node) { + $nid = $node->id(); + $relevant = $this->ochaTagger->getSimilarDocuments($nid, $node->get('body')->value); + if (empty($relevant)) { + return []; + } + + $max = reset($relevant); + + /** @var \Drupal\node\Entity\Node[] $nodes */ + $nodes = $this->entityTypeManager->getStorage('node')->loadMultiple(array_keys($relevant)); + + if (isset($nodes[$nid])) { + unset($nodes[$nid]); + } + + $categories = []; + foreach ($nodes as $node) { + if ($node->hasField('field_career_categories') && !$node->get('field_career_categories')->isEmpty()) { + if (!isset($categories[$node->get('field_career_categories')->entity->label()])) { + $categories[$node->get('field_career_categories')->entity->label()] = ($relevant[$node->id()] ?? .1) / $max; + } + else { + $categories[$node->get('field_career_categories')->entity->label()] *= ($relevant[$node->id()] ?? .1) / $max; + } + } + } + + // Sort reversed by count. 
+ arsort($categories); + + return $categories; + } + } diff --git a/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerRequeueForm.php b/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerRequeueForm.php new file mode 100644 index 000000000..3e2d75d31 --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerRequeueForm.php @@ -0,0 +1,98 @@ +t('Are you sure you want to re-queue the job?'); + } + + /** + * {@inheritdoc} + */ + public function getCancelUrl() { + return new Url('entity.node.canonical', ['node' => $this->node->id()]); + } + + /** + * {@inheritdoc} + */ + public function getConfirmText() { + return $this->t('Re-queue job'); + } + + /** + * {@inheritdoc} + */ + public function buildForm(array $form, FormStateInterface $form_state, NodeInterface $node = NULL) { + if ($node->bundle() != 'job') { + $this->messenger()->addWarning($this->t('Only jobs can be re-queued.')); + $form_state->setRedirect('entity.node.canonical', ['node' => $node->id()]); + return; + } + + if ($node->reliefweb_job_tagger_status->value != 'skipped') { + $this->messenger()->addWarning($this->t('Jobs does not need to be re-queued.')); + $form_state->setRedirect('entity.node.canonical', ['node' => $node->id()]); + return; + } + + // Check permissions. + $user = \Drupal::currentUser(); + if (!$user->hasPermission('ocha ai job tag requeue job')) { + $data['tabs'][0]['reliefweb_job_tagger.requeue']['#access'] = FALSE; + return; + } + + $this->node = $node; + $form = parent::buildForm($form, $form_state); + + return $form; + } + + /** + * {@inheritdoc} + */ + public function submitForm(array &$form, FormStateInterface $form_state) { + $node = $this->node; + + $node->set('reliefweb_job_tagger_status', 'queued'); + $node->set('field_job_tagger_queue_count', 1); + + $log_message = $node->getRevisionLogMessage(); + $log_message .= (empty($log_message) ? '' : ' ') . 
'Job has been manually queued for tagging.'; + $node->setRevisionLogMessage($log_message); + $node->save(); + + $this->messenger()->addMessage($this->t('Job has been re-queued for AI tagging.')); + + $form_state->setRedirect('entity.node.canonical', ['node' => $this->node->id()]); + } + +} diff --git a/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerTheme.php b/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerTheme.php new file mode 100644 index 000000000..273531a14 --- /dev/null +++ b/html/modules/custom/reliefweb_job_tagger/src/Form/RwJobTaggerTheme.php @@ -0,0 +1,591 @@ +get('entity_type.manager'), + $container->get('ocha_ai_tag.tagger'), + $container->get('http_client'), + ); + } + + /** + * {@inheritdoc} + */ + public function buildForm(array $form, FormStateInterface $form_state, ?bool $popup = NULL): array { + $intro = [ + 'On this page you can test how the AI Job Tagger will classify jobs, based on the key phrases defined for each theme.', + '', + '## Steps', + '1. Select one or more URL\s', + '2. Adapt the key phrases', + '3. Analyze the jobs', + '4. Review you the feedback', + '5. 
Adapt the key phrases (if needed)', + '', + 'Keep in mind that all changes will **NOT** be saved.', + ]; + $form['intro'] = [ + '#type' => 'processed_text', + '#text' => implode("\n", $intro), + '#format' => 'markdown', + ]; + + if ($feedback = $form_state->get('feedback')) { + $form['feedback'] = [ + '#type' => 'table', + '#header' => [ + $this->t('Url'), + $this->t('Theme'), + $this->t('Feedback (AI)'), + $this->t('Feedback (ES)'), + $this->t('Feedback (Vector)'), + $this->t('Product'), + $this->t('Info'), + ], + ]; + + foreach ($feedback as $url => $data) { + $form['feedback'][$url]['url'] = [ + '#markup' => $url, + ]; + + $form['feedback'][$url]['theme'] = [ + '#markup' => $data['theme'], + ]; + + $form['feedback'][$url]['feedback'] = [ + '#type' => 'processed_text', + '#text' => $data['feedback'], + '#format' => 'markdown', + ]; + + $form['feedback'][$url]['es_feedback'] = [ + '#type' => 'processed_text', + '#text' => $data['es_feedback'], + '#format' => 'markdown', + ]; + + $form['feedback'][$url]['vector_feedback'] = [ + '#type' => 'processed_text', + '#text' => $data['vector_feedback'], + '#format' => 'markdown', + ]; + + $form['feedback'][$url]['product'] = [ + '#type' => 'processed_text', + '#text' => $data['product'], + '#format' => 'markdown', + ]; + + $form['feedback'][$url]['info'] = [ + '#type' => 'processed_text', + '#text' => $data['info'], + '#format' => 'markdown', + ]; + } + } + + $form['urls'] = [ + '#type' => 'textarea', + '#title' => $this->t('Job Urls'), + '#description' => $this->t('Enter one or more Urls to job postings.'), + '#required' => TRUE, + ]; + + $form['definitions'] = [ + '#type' => 'table', + '#header' => [ + $this->t('Theme'), + $this->t('Key phrases'), + ], + ]; + + $definitions = $form_state->get('definitions') ?? 
[]; + if (empty($definitions)) { + $terms = $this->entityTypeManager->getStorage('taxonomy_term')->loadByProperties([ + 'status' => 1, + 'vid' => 'theme', + ]); + + /** @var \Drupal\taxonomy\Entity\Term $term */ + foreach ($terms as $term) { + $definitions[$term->id()] = [ + 'name' => $term->getName(), + 'definition' => $term->get('field_example_job_posting')->value ?? $term->getDescription() ?? $term->getName(), + ]; + } + } + + foreach ($definitions as $id => $definition) { + $form['definitions'][$id]['name'] = [ + '#type' => 'textfield', + '#title' => $this->t('Name'), + '#title_display' => 'hidden', + '#required' => TRUE, + '#value' => $definition['name'], + '#disabled' => TRUE, + '#atttibutes' => [ + 'disabled' => 'disabled', + 'readonly' => 'readonly', + ], + ]; + $form['definitions'][$id]['definition'] = [ + '#type' => 'textarea', + '#title' => $this->t('Definition'), + '#title_display' => 'hidden', + '#required' => TRUE, + '#value' => $definition['definition'], + ]; + } + + $form['reset_cache'] = [ + '#type' => 'checkbox', + '#title' => $this->t('Reset cache'), + '#default_value' => FALSE, + ]; + + $form['submit'] = [ + '#type' => 'submit', + '#value' => $this->t('Analyze jobs'), + ]; + + return $form; + } + + /** + * {@inheritdoc} + */ + public function submitForm(array &$form, FormStateInterface $form_state): void { + $api_fields = [ + 'theme' => 'theme', + ]; + + $definitions = $form_state->getValue('definitions', []); + $form_state->set('definitions', $definitions); + + if ($form_state->getValue('reset_cache', FALSE)) { + $this->setTermMapping($definitions); + } + + $results = []; + $urls = $form_state->getValue('urls', ''); + $urls = explode("\n", $urls); + + foreach ($urls as $url) { + $url = trim($url); + $path = parse_url($url, PHP_URL_PATH); + $parts = explode('/', $path); + + if (!isset($parts[2]) || !is_numeric($parts[2])) { + $results[$url] = [ + 'theme' => '', + 'feedback' => 'Skipped, use URL like 
https://reliefweb.int/job/4064890/country-director-haiti', + ]; + continue; + } + + $nid = $parts[2]; + /** @var \Drupal\node\Entity\Node $node */ + $node = $this->entityTypeManager->getStorage('node')->load($nid); + if (!$node) { + $results[$url] = [ + 'theme' => '', + 'feedback' => 'Skipped, unable to load', + ]; + continue; + } + + $info = []; + + // Get field data. + $categories = $node->get('field_theme')->referencedEntities(); + $theme = ''; + if ($categories) { + $theme = $categories[0]->label(); + } + + // Get ES feedback. + // Doc isn't indexed yet. + $es = $this->getMostRelevantTermsFromEs('jobs', [ + 'id' => $node->id(), + 'title' => $node->getTitle(), + 'body' => $node->body->value, + ], $api_fields, 50); + $es = $es['theme'] ?? []; + + $es_first = ''; + $ai_first = ''; + $es_feedback = ''; + + if (!empty($es) && isset($es)) { + $es_feedback = $this->setAiFeedback($es); + $es_first = array_key_first($es); + $first = reset($es); + if ($first > .70) { + $info[] = '- High ES confidence, skip AI'; + } + elseif ($first > .50) { + $info[] = '- Average ES confidence'; + } + } + + // Do vector search on ES. + $similar_feedback = ''; + $similar = $this->getSimilarJobs($node); + $similar_feedback = $this->setAiFeedback($similar); + + // Get AI feedback. + $text = $node->getTitle() . "\n\n" . $node->get('body')->value; + $ai = $this->processDoc($text, $definitions); + $ai_first = array_key_first($ai); + + if ($ai_first == $es_first) { + $info[] = '- AI and ES agree'; + } + + $mult = []; + // Multiple confidence levels, if not defined fall back to 20%. + foreach (array_keys($ai) as $key) { + $mult[$key] = $ai[$key] * ($es[$key] ?? .2); + $mult[$key] = $ai[$key] * ($similar[$key] ?? .2); + } + + // Sort reversed and select first. + arsort($mult); + $info[] = '- First in common: ' . 
array_key_first($mult); + + $results[$url] = [ + 'theme' => $theme, + 'feedback' => $this->setAiFeedback($ai, 10), + 'es_feedback' => $es_feedback, + 'vector_feedback' => $similar_feedback, + 'product' => $this->setAiFeedback($mult), + 'info' => implode("\n", $info), + ]; + } + + $form_state->set('feedback', $results); + $form_state->setRebuild(TRUE); + } + + /** + * {@inheritdoc} + */ + public function getFormId(): string { + return 'rw_job_tagger_theme'; + } + + /** + * Set term mapping. + */ + protected function setTermMapping(array $definitions) : void { + $mapping = [ + 'theme' => [], + ]; + + foreach ($definitions as $definition) { + $mapping['theme'][$definition['name']] = $definition['definition']; + } + + $term_cache_tags = []; + + $this->ochaTagger + ->setVocabularies($mapping, $term_cache_tags) + ->clearCache(); + } + + /** + * Analyze document. + */ + protected function processDoc(string $text) : array { + $data = $this->ochaTagger + ->tag($text, [OchaAiTagTagger::CALCULATION_METHOD_MEAN_WITH_CUTOFF], OchaAiTagTagger::AVERAGE_FULL_AVERAGE); + + $data = $data[OchaAiTagTagger::AVERAGE_FULL_AVERAGE][OchaAiTagTagger::CALCULATION_METHOD_MEAN_WITH_CUTOFF]; + + return $data['theme'] ?? []; + } + + /** + * Construct AI feedback message. + */ + protected function setAiFeedback($data, $limit = 5) { + $message = []; + + // Max n items. + $items = array_slice($data, 0, $limit); + + foreach ($items as $key => $confidence) { + $message[] = '- ' . $key . ': ' . floor(100 * $confidence) . '%' . "\n"; + } + + return implode('', $message); + } + + /** + * Generate a list of terms sorted by relevance to a document. + * + * We retrieve a list of the most similar documents using a "more like this" + * query on the Elasticsearch index, extract the terms from the documents + * and sort them using the "similarity" score from the "more like this" query. + * + * @param string $resource + * The API resource. 
+ * @param int|array $document + * Either the document ID if the document is already indexed or an + * associative array with the document's title and description. + * @param array $fields + * Associative array with the vocabularies to retrieve keyed by the + * corresponding Elasticsearch fields. + * @param int $limit + * Maxium number of similar documents to retrieve. Defaults to 10. + * @param array $parameters + * Parameters for the more like this query. + * + * @return array + * Associative array keyed by vocabulary with maps of term to relevance as + * values. + */ + public function getMostRelevantTermsFromEs( + string $resource, + int|array $document, + array $fields, + int $limit = 10, + // This can be adjusted but seems to give good results. + array $parameters = [ + 'min_term_freq' => 3, + 'min_word_length' => 4, + 'min_doc_freq' => 4, + 'max_query_terms' => 40, + 'boost_terms' => 10, + 'minimum_should_match' => '60%', + ], + ): array { + $index = $this->config('reliefweb_api.settings')->get('base_index_name') . '_' . $resource; + $url = $this->config('reliefweb_api.settings')->get('elasticsearch') . '/' . $index . '/_search'; + + // If the document is indexed, we can simply use it's ID. + if (is_int($document)) { + $entity_id = (int) $document; + $like = [ + '_id' => $entity_id, + ]; + + // This filter is either the given document or published or expired + // documents. This ensures the given document is returned so we can + // normalize the scores. + $filter = [ + [ + 'bool' => [ + 'should' => [ + [ + 'term' => [ + 'id' => $document, + ], + ], + [ + 'terms' => [ + 'status' => ['published', 'expired'], + ], + ], + ], + ], + ], + ]; + } + // Otherwise we pass the title and body and let Elasticsearch analyze those + // as if they were to be indexed. 
+ elseif (isset($document['id'], $document['title'], $document['body'])) { + $entity_id = (int) $document['id']; + $like = [ + 'doc' => [ + 'id' => $entity_id, + 'title' => $document['title'], + 'body' => $document['body'], + 'status' => 'published', + ], + ]; + + // Here, we can only filter on the published/expired documents. + $filter = [ + [ + 'terms' => [ + 'status' => ['published', 'expired'], + ], + ], + ]; + } + else { + return []; + } + + $payload = [ + 'query' => [ + 'bool' => [ + 'must' => [ + [ + 'more_like_this' => [ + // Only compare the title and body. We could extend that to + // include the source etc. but that's more similar to the data + // used for the embeddings comparison. + 'fields' => ['title', 'body'], + 'like' => [ + [ + '_index' => $index, + ] + $like, + ], + // This is important: Elasticsearch returns scores that relative + // to the current search query so to have some comparable we + // need to normalize the scores. With the following parameter, + // the document is included in the results (= the first item in + // the results with the higher score). We can then use this max + // score to normalize the other scores. + 'include' => TRUE, + ] + $parameters, + ], + ], + // We can only filter on the status. The reviewer user ID is not in + // the API so we cannot limit to similar documents reviewed by + // editors. That means the list of similar documents may include + // documents from trusted users with possibly less correct term + // selection. + 'filter' => $filter, + ], + ], + '_source' => array_merge(['id'], array_keys($fields)), + // The document to compare is included so we increase the limit by 1 to + // account for that. 
+ 'size' => $limit + 1, + ]; + + try { + $response = $this->httpClient->post($url, [ + 'json' => $payload, + ]); + } + catch (\Exception $exception) { + $this->logger('reliefweb_job_tagger')->error($exception->getMessage()); + return []; + } + + if ($response->getStatusCode() !== 200) { + $this->logger('reliefweb_job_tagger')->error($response->getStatusCode() . ': ' . ($response->getBody()?->getContents() ?? '')); + return []; + } + + $data = json_decode($response->getBody()->getContents(), TRUE); + if (empty($data['hits']['hits'])) { + $this->logger('reliefweb_job_tagger')->warning('No hits found.'); + return []; + } + + // Aggregate the scores for each term of each vocabulary. + $max_score = 0; + $vocabularies = []; + foreach ($data['hits']['hits'] as $item) { + $source = $item['_source']; + $score = $item['_score']; + $id = (int) $source['id']; + + if ($score > $max_score) { + $max_score = $score; + } + + // Skip the original document because it's only included to normalize the + // similarity scores of the other documents. + if ($id === $entity_id) { + continue; + } + + // Aggregate the scores for each term of each vocabulary. + foreach ($fields as $field => $vocabulary) { + $terms = []; + if (!empty($source[$field])) { + foreach ($source[$field] as $term) { + $vocabularies[$vocabulary][$term['name']][] = $score; + } + } + } + } + + if (empty($vocabularies)) { + return []; + } + + // Calculate the nornmalized mean of the scores for each term. + foreach ($vocabularies as $vocabulary => $terms) { + foreach ($terms as $term => $scores) { + $vocabularies[$vocabulary][$term] = array_sum($scores) / $max_score / count($scores); + } + // Sort the terms by relevance. + arsort($vocabularies[$vocabulary]); + } + + return $vocabularies; + } + + /** + * Get similar jobs. 
+ */ + protected function getSimilarJobs(NodeInterface $node) { + $nid = $node->id(); + $relevant = $this->ochaTagger->getSimilarDocuments($nid, $node->get('body')->value); + if (empty($relevant)) { + return []; + } + + $max = reset($relevant); + + /** @var \Drupal\node\Entity\Node[] $nodes */ + $nodes = $this->entityTypeManager->getStorage('node')->loadMultiple(array_keys($relevant)); + + if (isset($nodes[$nid])) { + unset($nodes[$nid]); + } + + $categories = []; + foreach ($nodes as $node) { + if ($node->hasField('field_theme') && !$node->get('field_theme')->isEmpty()) { + if (!isset($categories[$node->get('field_theme')->entity->label()])) { + $categories[$node->get('field_theme')->entity->label()] = ($relevant[$node->id()] ?? .1) / $max; + } + else { + $categories[$node->get('field_theme')->entity->label()] *= ($relevant[$node->id()] ?? .1) / $max; + } + } + } + + // Sort reversed by count. + arsort($categories); + + return $categories; + } + +} diff --git a/html/modules/custom/reliefweb_job_tagger/src/Plugin/QueueWorker/OchaAiJobTagTaggerWorker.php b/html/modules/custom/reliefweb_job_tagger/src/Plugin/QueueWorker/OchaAiJobTagTaggerWorker.php index af11cf21a..e2649a047 100644 --- a/html/modules/custom/reliefweb_job_tagger/src/Plugin/QueueWorker/OchaAiJobTagTaggerWorker.php +++ b/html/modules/custom/reliefweb_job_tagger/src/Plugin/QueueWorker/OchaAiJobTagTaggerWorker.php @@ -8,6 +8,7 @@ use Drupal\Core\Logger\LoggerChannelInterface; use Drupal\Core\Plugin\ContainerFactoryPluginInterface; use Drupal\Core\Queue\QueueWorkerBase; +use Drupal\node\NodeInterface; use Drupal\ocha_ai_tag\Services\OchaAiTagTagger; use GuzzleHttp\ClientInterface; use Symfony\Component\DependencyInjection\ContainerInterface; @@ -120,17 +121,20 @@ public function processItem($data) { } if ($node->body->isEmpty()) { + $this->setJobStatus($node, 'No body text present, AI skipped'); $this->logger->warning('No body text present for node @nid, skipping', ['@nid' => $nid]); return; } // Only 
process it when fields are empty. if (!$node->field_career_categories->isEmpty()) { + $this->setJobStatus($node, 'Category already specified, AI skipped'); $this->logger->warning('Category already specified for node @nid, skipping', ['@nid' => $nid]); return; } if (!$node->field_theme->isEmpty()) { + $this->setJobStatus($node, 'Theme(s) already specified, AI skipped'); $this->logger->warning('Theme(s) already specified for node @nid, skipping', ['@nid' => $nid]); return; } @@ -167,6 +171,8 @@ public function processItem($data) { ->tag($text, [OchaAiTagTagger::CALCULATION_METHOD_MEAN_WITH_CUTOFF], OchaAiTagTagger::AVERAGE_FULL_AVERAGE); } catch (\Exception $exception) { + $this->setJobStatus($node, 'AI tagging failed, AI skipped', FALSE); + $this->logger->error('Tagging exception for node @nid: @error', [ '@nid' => $nid, '@error' => strtr($exception->getMessage(), "\n", " "), @@ -175,11 +181,13 @@ public function processItem($data) { } if (empty($data)) { + $this->setJobStatus($node, 'AI tagging failed, AI skipped', FALSE); $this->logger->error('No data received from AI for node @nid', ['@nid' => $nid]); return; } if (!isset($data[OchaAiTagTagger::AVERAGE_FULL_AVERAGE][OchaAiTagTagger::CALCULATION_METHOD_MEAN_WITH_CUTOFF])) { + $this->setJobStatus($node, 'AI tagging failed, AI skipped', FALSE); $this->logger->error('Data "average mean with cutoff" missing from AI for node @nid', ['@nid' => $nid]); return; } @@ -189,6 +197,29 @@ public function processItem($data) { $message = []; $needs_save = FALSE; + $use_es = $this->configFactory->get('reliefweb_job_tagger.settings')->get('use_es', FALSE); + $es = []; + $es_terms = [ + 'career_categories' => [], + 'theme' => [], + ]; + if ($use_es) { + // Get ES feedback. + $api_fields = [ + 'career_categories' => 'career_category', + 'theme' => 'theme', + ]; + // Doc isn't indexed yet. 
+ $es = $this->getMostRelevantTermsFromEs('jobs', [ + 'id' => $node->id(), + 'title' => $node->getTitle(), + 'body' => $node->body->value, + ], $api_fields, 50); + + $es_terms['career_category'] = $this->getRelevantTerm('career_category', $es['career_category'] ?? [], 1); + $es_terms['theme'] = $this->getRelevantTerm('theme', $es['theme'] ?? [], 3); + } + if (isset($data['career_category']) && $node->field_career_categories->isEmpty()) { $ai_term = $this->getRelevantTerm('career_category', $data['career_category'], 1); $message[] = $this->setAiFeedback('Career category (AI)', $data['career_category'], [$ai_term]); @@ -196,53 +227,69 @@ public function processItem($data) { $node->set('field_career_categories', $ai_term); $needs_save = TRUE; - $use_es = $this->configFactory->get('reliefweb_job_tagger.settings')->get('use_es', FALSE); if ($use_es) { - // Get ES feedback. - $api_fields = [ - 'career_categories' => 'career_category', - ]; - $es = $this->getMostRelevantTermsFromEs('jobs', $node->id(), $api_fields, 50); + $ai = $data['career_category']; $es = $es['career_category'] ?? []; + $es_term = $es_terms['career_category'] ?? []; + $similar = $this->getSimilarJobs($node, 'field_career_categories'); - $es_term = $this->getRelevantTerm('career_category', $es, 1); $message[] = $this->setAiFeedback('Career category (ES)', $es, [$es_term]); - // Combine both AI and ES. This gives, most of the time, a more accurate - // result. - $ai = $data['career_category']; - $intersect = array_intersect_key($es, $ai); - if (!empty($intersect)) { - // Multiple confidence levels. 
- $mult = []; - foreach (array_keys($ai) as $key) { - if (array_key_exists($key, $es)) { - $mult[$key] = $ai[$key] * $es[$key]; - } - } - arsort($mult); - - $mult_term = $this->getRelevantTerm('career_category', $mult, 1); - array_unshift($message, $this->setAiFeedback('Career category', $mult, [$mult_term])); + $mult = []; - $node->set('field_career_categories', $mult_term); - $needs_save = TRUE; + // Multiple confidence levels, if not defined fall back to 20%. + foreach (array_keys($ai) as $key) { + $mult[$key] = $ai[$key] * ($es[$key] ?? .2) * ($similar[$key] ?? .2); } - // Use the results of the AI only if there are no results from ES so - // that there is at least consistent feedback (the 3 sections) for the - // editors. - else { - array_unshift($message, $this->setAiFeedback('Career category', $ai, [$ai_term])); + + // Sort reversed and select first. + arsort($mult); + + $mult_term = $this->getRelevantTerm('career_category', $mult, 1); + if (!is_array($mult_term)) { + $mult_term = [$mult_term]; } + array_unshift($message, $this->setAiFeedback('Career category', $mult, $mult_term)); + + $node->set('field_career_categories', $mult_term); + $needs_save = TRUE; } } if (isset($data['theme']) && $node->field_theme->isEmpty()) { $terms = $this->getRelevantTerm('theme', $data['theme'], 3); - $message[] = $this->setAiFeedback('Theme(s)', $data['theme'], $terms); + $message[] = $this->setAiFeedback('Themes (AI)', $data['theme'], $terms); $node->set('field_theme', $terms); $needs_save = TRUE; + + if ($use_es) { + $ai = $data['theme']; + $es = $es['theme'] ?? []; + $es_term = $es_terms['theme'] ?? []; + $similar = $this->getSimilarJobs($node, 'field_theme'); + + $message[] = $this->setAiFeedback('Themes (ES)', $es, $es_term); + + $mult = []; + + // Multiple confidence levels, if not defined fall back to 20%. + foreach (array_keys($ai) as $key) { + $mult[$key] = $ai[$key] * ($es[$key] ?? .2) * ($similar[$key] ?? .2); + } + + // Sort reversed and select first. 
+ arsort($mult); + + $mult_term = $this->getRelevantTerm('theme', $mult, 3); + if (!is_array($mult_term)) { + $mult_term = [$mult_term]; + } + array_unshift($message, $this->setAiFeedback('Themes', $mult, $mult_term)); + + $node->set('field_theme', $mult_term); + $needs_save = TRUE; + } } if ($needs_save) { @@ -573,4 +620,61 @@ public function getMostRelevantTermsFromEs( return $vocabularies; } + /** + * Get similar jobs. + */ + protected function getSimilarJobs(NodeInterface $node, string $field) { + $nid = $node->id(); + $relevant = $this->jobTagger->getSimilarDocuments($nid, $node->get('body')->value); + if (empty($relevant)) { + return []; + } + + $max = reset($relevant); + + /** @var \Drupal\node\Entity\Node[] $nodes */ + $nodes = $this->entityTypeManager->getStorage('node')->loadMultiple(array_keys($relevant)); + + if (isset($nodes[$nid])) { + unset($nodes[$nid]); + } + + $terms = []; + foreach ($nodes as $node) { + if ($node->hasField($field) && !$node->get($field)->isEmpty()) { + foreach ($node->get($field)->referencedEntities() as $term) { + if (!isset($terms[$term->label()])) { + $terms[$term->label()] = ($relevant[$node->id()] ?? .1) / $max; + } + else { + $terms[$term->label()] *= ($relevant[$node->id()] ?? .1) / $max; + } + } + } + } + + // Sort reversed by count. + arsort($terms); + + return $terms; + } + + /** + * Set job status and revision log. 
+ */ + protected function setJobStatus(NodeInterface &$node, string $message, bool $permanent = TRUE) { + $node->setRevisionCreationTime(time()); + $node->setRevisionLogMessage($message); + + if ($permanent) { + $node->set('reliefweb_job_tagger_status', 'processed'); + } + else { + $node->set('reliefweb_job_tagger_status', 'skipped'); + } + + $node->setNewRevision(TRUE); + $node->save(); + } + } diff --git a/html/themes/custom/common_design_subtheme/templates/user/admin-menu.html.twig b/html/themes/custom/common_design_subtheme/templates/user/admin-menu.html.twig index 8f94fc025..e1224b087 100644 --- a/html/themes/custom/common_design_subtheme/templates/user/admin-menu.html.twig +++ b/html/themes/custom/common_design_subtheme/templates/user/admin-menu.html.twig @@ -49,6 +49,7 @@
  • Taxonomy
  • Community topics
  • Guidelines
  • +
  • List stalled AI jobs