Browse Source

fix: schema.org graph silently failing to use new tasks

A new existing page was silently being overwritten by a schema.org page
Added detection for future conflicts with existing pages as well
as conflicts of renamed pages. Renamed conflicting properties
in same way as classes. Also added a debug option to create an edn
file which is idempotent and helpful to confirm there are no
regressions
Gabriel Horner 2 years ago
parent
commit
c4f6873a58
1 changed files with 118 additions and 38 deletions
  1. 118 38
      scripts/src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs

+ 118 - 38
scripts/src/logseq/tasks/db_graph/create_graph_with_schema_org.cljs

@@ -33,20 +33,20 @@
       (throw (ex-info (str "No :block/uuid for " class-id) {}))))
 
 (defn- get-comment-string
-  [rdfs-comment renamed-classes]
+  [rdfs-comment renamed-pages]
   (let [desc* (if (map? rdfs-comment)
                 (get rdfs-comment "@value")
                 rdfs-comment)
         ;; Update refs to renamed classes
-        regex (re-pattern (str "\\[\\[(" (string/join "|" (keys renamed-classes)) ")\\]\\]"))
-        desc (string/replace desc* regex #(str "[[" (get renamed-classes (second %)) "]]"))]
+        regex (re-pattern (str "\\[\\[(" (string/join "|" (keys renamed-pages)) ")\\]\\]"))
+        desc (string/replace desc* regex #(str "[[" (get renamed-pages (second %)) "]]"))]
     ;; Fix markdown and html links to schema website docs
     (string/replace desc #"(\(|\")/docs" "$1https://schema.org/docs")))
 
 (defn- strip-schema-prefix [s]
   (string/replace-first s "schema:" ""))
 
-(defn- ->class-page [class-m class-db-ids class-uuids class-properties property-uuids {:keys [verbose renamed-classes]}]
+(defn- ->class-page [class-m class-db-ids class-uuids class-properties property-uuids {:keys [verbose renamed-classes renamed-pages]}]
   (let [parent-class* (class-m "rdfs:subClassOf")
         parent-class (cond
                        (map? parent-class*)
@@ -73,7 +73,7 @@
              :db/id (get-class-db-id class-db-ids (class-m "@id"))
              :properties (cond-> {:url url}
                            (class-m "rdfs:comment")
-                           (assoc :description (get-comment-string (class-m "rdfs:comment") renamed-classes)))}
+                           (assoc :description (get-comment-string (class-m "rdfs:comment") renamed-pages)))}
       parent-class
       (assoc :block/namespace {:db/id (get-class-db-id class-db-ids parent-class)})
       (seq properties)
@@ -105,7 +105,7 @@
              (when (class-map %) :page))
         range-includes))
 
-(defn- ->property-page [property-m prop-uuid class-map class-uuids {:keys [verbose renamed-classes]}]
+(defn- ->property-page [property-m prop-uuid class-map class-uuids {:keys [verbose renamed-pages renamed-properties]}]
   (let [range-includes (get-range-includes property-m)
         schema-type (get-schema-type range-includes class-map)
         ;; Pick first range to determine type as only one range is supported currently
@@ -113,12 +113,15 @@
             (println "Picked property type:"
                      {:property (property-m "@id") :type schema-type :range-includes (vec range-includes)}))
         _ (assert schema-type (str "No schema found for property " (property-m "@id")))
+        inverted-renamed-properties (set/map-invert renamed-properties)
+        class-name (strip-schema-prefix (property-m "@id"))
+        url (str "https://schema.org/" (get inverted-renamed-properties class-name class-name))
         schema (cond-> {:type schema-type}
                  ;; This cardinality rule should be adjusted as we use schema.org more
                  (= schema-type :page)
                  (assoc :cardinality :many)
                  (property-m "rdfs:comment")
-                 (assoc :description (get-comment-string (property-m "rdfs:comment") renamed-classes))
+                 (assoc :description (get-comment-string (property-m "rdfs:comment") renamed-pages))
                  (= schema-type :page)
                  (assoc :classes (let [invalid-classes (remove class-uuids range-includes)
                                        _ (when (seq invalid-classes)
@@ -131,7 +134,7 @@
     {(keyword (strip-schema-prefix (property-m "@id")))
      {:block/uuid prop-uuid
       :block/schema schema
-      :properties {:url (string/replace-first (property-m "@id") "schema:" "https://schema.org/")}}}))
+      :properties {:url url}}}))
 
 (defn- get-class-to-properties
   "Given a vec of class ids and a vec of properties map to process, return a map of
@@ -162,21 +165,60 @@
     (and (seq range-includes)
          (every? (fn [x] (contains? unsupported-data-types x)) range-includes))))
 
+(defn- get-vector-conflicts
+  "Given a seq of tuples returns a seq of tuples that conflict i.e. their first element
+   has a case insensitive conflict/duplicate with another. An example conflict:
+   [[\"schema:businessFunction\" :property] [\"schema:BusinessFunction\" :class]]"
+  [tuples-seq]
+  (->> tuples-seq
+       (group-by (comp common-util/page-name-sanity-lc first))
+       (filter #(> (count (val %)) 1))
+       vals))
+
+(defn- detect-final-conflicts
+  "Does one final detection for conflicts after everything has been renamed"
+  [all-properties all-classes page-tuples]
+  (let [property-ids (map #(vector (% "@id") :property) all-properties)
+        class-ids (map #(vector (% "@id") :class) all-classes)
+        existing-conflicts (get-vector-conflicts (concat property-ids class-ids page-tuples))]
+    (when (seq existing-conflicts) (prn :CONFLICTS existing-conflicts))
+    (assert (empty? existing-conflicts)
+            "There are no conflicts between existing pages, schema classes and properties")))
+
+(defn- detect-property-conflicts-and-get-renamed-properties
+  "Detects conflicts between properties and existing pages and returns renamed properties"
+  [property-ids existing-pages {:keys [verbose]}]
+  (let [conflicts (get-vector-conflicts (concat property-ids existing-pages))
+        _ (assert (every? #(= 2 (count %)) conflicts) "All conflicts must only be between two elements")
+        renamed-properties (->> conflicts
+                                (map #(-> % second first))
+                                ;; Renaming properties '_property' suffix guarantees uniqueness
+                                ;; b/c schema.org doesn't use '_' in their names
+                                (map #(vector % (str % "_property")))
+                                (into {}))]
+    (if verbose
+      (println "Renaming the following properties because they have names that conflict with Logseq's built in pages"
+               (keys renamed-properties) "\n")
+      (println "Renaming" (count renamed-properties) "properties due to page name conflicts"))
+    renamed-properties))
+
 (defn- detect-id-conflicts-and-get-renamed-classes
-  "Properties and class names conflict in Logseq because schema.org names are
-  case sensitive whereas Logseq's :block/name is case insensitive. This is dealt
-  with by appending a '_Class' suffix to conflicting classes.  If this strategy
-  changes, be sure to update schema->logseq-data-types"
-  [property-ids class-ids {:keys [verbose]}]
-  (let [conflicts
-        (->> (concat property-ids class-ids)
-             (group-by (comp common-util/page-name-sanity-lc first))
-             (filter #(> (count (val %)) 1))
-             vals)
+  "Detects conflicts between classes AND properties and existing
+  pages. Renames any detected conflicts. Properties and class names conflict in
+  Logseq because schema.org names are case sensitive whereas Logseq's
+  :block/name is case insensitive. This is dealt with by appending a '_Class'
+  suffix to conflicting classes.  If this strategy changes, be sure to update
+  schema->logseq-data-types"
+  [property-ids class-ids existing-pages {:keys [verbose]}]
+  (let [conflicts (get-vector-conflicts (concat property-ids class-ids))
         ;; If this assertion fails then renamed-classes approach to resolving
         ;; conflicts may need to be revisited
-        _ (assert (every? #(= (map second %) [:property :class]) conflicts)
-                  "All conflicts are between a property and class")
+        _ (assert (every? #(= 2 (count %)) conflicts) "All conflicts must only be between two elements")
+        existing-conflicts (get-vector-conflicts (concat class-ids existing-pages))
+        _ (when (seq existing-conflicts) (prn :EXISTING-CLASS-CONFLICTS existing-conflicts))
+        ;; Add existing-conflicts to conflicts if this ever fails
+        _ (assert (empty? existing-conflicts)
+                  "There are no conflicts between existing pages and schema classes and properties")
         renamed-classes (->> conflicts
                              (map #(-> % second first))
                              ;; Renaming classes with '_Class' suffix guarantees uniqueness
@@ -187,7 +229,6 @@
       (println "Renaming the following classes because they have property names that conflict with Logseq's case insensitive :block/name:"
                (keys renamed-classes) "\n")
       (println "Renaming" (count renamed-classes) "classes due to page name conflicts"))
-        ;; Looks for all instances of a renamed class and updates them to the renamed class reference
     renamed-classes))
 
 (defn- get-all-properties [schema-data {:keys [verbose]}]
@@ -232,39 +273,47 @@
 
 (defn- get-all-classes-and-properties
   "Get all classes and properties from raw json file"
-  [schema-data options]
+  [schema-data existing-pages options]
   (let [;; TODO: See if it's worth pulling in non-types like schema:MusicReleaseFormatType
         all-classes* (filter #(contains? (set (as-> (% "@type") type'
                                                 (if (string? type') [type'] type')))
                                          "rdfs:Class")
                              schema-data)
         all-properties* (get-all-properties schema-data options)
+        property-tuples (map #(vector (% "@id") :property) all-properties*)
+        class-tuples (map #(vector (% "@id") :class) all-classes*)
+        page-tuples (map #(vector (str "schema:" %) :page) existing-pages)
         renamed-classes (detect-id-conflicts-and-get-renamed-classes
-                         (map #(vector (% "@id") :property) all-properties*)
-                         (map #(vector (% "@id") :class) all-classes*)
-                         options)
-        rename-class-ids (fn [m]
-                           (w/postwalk (fn [x]
-                                         (if-let [new-class (and (map? x) (renamed-classes (x "@id")))]
-                                           (merge x {"@id" new-class})
-                                           x)) m))
+                         property-tuples class-tuples page-tuples options)
+        renamed-properties (detect-property-conflicts-and-get-renamed-properties
+                            property-tuples page-tuples options)
+        renamed-pages (merge renamed-classes renamed-properties)
         ;; Updates keys like @id, @subClassOf
-        all-classes (map rename-class-ids all-classes*)
+        rename-page-ids (fn [m]
+                          (w/postwalk (fn [x]
+                                        (if-let [new-page (and (map? x) (renamed-pages (x "@id")))]
+                                          (merge x {"@id" new-page})
+                                          x)) m))
         ;; Updates keys like @id, @rangeIncludes, @domainIncludes
-        all-properties (map rename-class-ids all-properties*)]
+        all-classes (map rename-page-ids all-classes*)
+        all-properties (map rename-page-ids all-properties*)]
+    (detect-final-conflicts all-properties all-classes page-tuples)
     {:all-classes all-classes
      :all-properties all-properties
+     :renamed-properties (->> renamed-properties
+                              (map (fn [[k v]] [(strip-schema-prefix k) (strip-schema-prefix v)]))
+                              (into {}))
      :renamed-classes (->> renamed-classes
                            (map (fn [[k v]] [(strip-schema-prefix k) (strip-schema-prefix v)]))
                            (into {}))}))
 
-(defn- create-init-data [options]
+(defn- create-init-data [existing-pages options]
   (let [schema-data (-> (str (fs/readFileSync "resources/schemaorg-current-https.json"))
                         js/JSON.parse
                         (js->clj)
                         (get "@graph"))
-        {:keys [all-classes all-properties renamed-classes]}
-        (get-all-classes-and-properties schema-data options)
+        {:keys [all-classes all-properties renamed-classes renamed-properties]}
+        (get-all-classes-and-properties schema-data existing-pages options)
         ;; Generate data shared across pages and properties
         class-map (->> all-classes
                        (map #(vector (% "@id") %))
@@ -283,13 +332,17 @@
         property-uuids (->> select-properties
                             (map #(vector % (random-uuid)))
                             (into {}))
+        options' (assoc options
+                        :renamed-classes renamed-classes
+                        :renamed-properties renamed-properties
+                        :renamed-pages (merge renamed-properties renamed-classes))
         ;; Generate pages and properties
         properties (generate-properties
                     (filter #(contains? select-properties (% "@id")) all-properties)
-                    property-uuids class-map class-uuids (assoc options :renamed-classes renamed-classes))
+                    property-uuids class-map class-uuids options')
         pages (generate-pages
                (map #(class-map %) select-class-ids)
-               class-uuids class-to-properties property-uuids (assoc options :renamed-classes renamed-classes))]
+               class-uuids class-to-properties property-uuids options')]
     {:pages-and-blocks pages
      :properties properties}))
 
@@ -297,9 +350,34 @@
   "Options spec"
   {:help {:alias :h
           :desc "Print help"}
+   :debug {:alias :d
+           :desc "Prints additional debug info and a schema.edn for debugging"}
    :verbose {:alias :v
              :desc "Verbose mode"}})
 
+(defn- write-debug-file [blocks-tx db]
+  (let [block-uuid->name* (->> (d/q '[:find (pull ?b [:block/name :block/uuid]) :where [?b :block/name]] db)
+                               (map first)
+                               (map (juxt :block/uuid :block/name))
+                               (into {}))
+        block-uuid->name #(or (block-uuid->name* %) (throw (ex-info (str "No entity found for " %) {})))
+        ;; TODO: Figure out why some Thing's properties don't exist
+        block-uuid->name-please-fixme
+        #(or (block-uuid->name* %2) (println "WARNING: Page" (pr-str (:block/original-name %1)) "skipped uuid" %2))]
+    (fs/writeFileSync "schema-org.edn"
+                      (pr-str
+                       (->> blocks-tx
+                            (map (fn [m]
+                                   (cond-> (select-keys m [:block/name :block/type :block/original-name
+                                                           :block/properties :block/schema])
+                                     (seq (:block/properties m))
+                                     (update :block/properties #(update-keys % block-uuid->name))
+                                     (seq (get-in m [:block/schema :properties]))
+                                     (update-in [:block/schema :properties] #(mapv (partial block-uuid->name-please-fixme m) %))
+                                     (seq (get-in m [:block/schema :classes]))
+                                     (update-in [:block/schema :classes] #(mapv block-uuid->name %)))))
+                            set)))))
+
 (defn -main [args]
   (let [[graph-dir] args
         options (cli/parse-opts args {:spec spec})
@@ -311,13 +389,15 @@
                         ((juxt node-path/dirname node-path/basename) graph-dir)
                         [(node-path/join (os/homedir) "logseq" "graphs") graph-dir])
         conn (create-graph/init-conn dir db-name)
-        init-data (create-init-data options)
+        init-data (create-init-data (d/q '[:find [?name ...] :where [?b :block/name ?name]] @conn)
+                                    options)
         blocks-tx (create-graph/create-blocks-tx init-data)]
     (println "Generating" (str (count (filter :block/name blocks-tx)) " pages with "
                                (count (:pages-and-blocks init-data)) " classes and "
                                (count (:properties init-data)) " properties ..."))
     (d/transact! conn blocks-tx)
     (when (:verbose options) (println "Transacted" (count (d/datoms @conn :eavt)) "datoms"))
+    (when (:debug options) (write-debug-file blocks-tx @conn))
     (println "Created graph" (str db-name "!"))))
 
 (when (= nbb/*file* (:file (meta #'-main)))