Spaces:

librarian-bots
/

MetaRefine

Runtime error

App Files Files Community

davanstrien HF Staff committed on Jun 30, 2023

Commit

8b71d33

1 Parent(s): 213c06e

tidy cache

Browse files

Files changed (1) hide show

app.py +48 -33

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-import datetime
 import os
 import copy
 from dataclasses import asdict, dataclass
@@ -29,8 +28,15 @@ from httpx import Client
 from httpx_caching import CachingClient
 from httpx_caching import OneDayCacheHeuristic
 client = Client()
 client = CachingClient(client, heuristic=OneDayCacheHeuristic())
@@ -57,7 +63,7 @@ def get_model_labels(model):
 class EngagementStats:
     likes: int
     downloads: int
-    created_at: datetime.datetime
 def _get_engagement_stats(hub_id):
@@ -298,6 +304,7 @@ GENERIC_SCORES = generate_common_scores()
 # @cache.memoize(expire=60 * 60 * 24 * 3)  # expires after 3 days
 def _basic_check(hub_id):
     data = ModelMetadata.from_hub(hub_id)
     score = 0
@@ -358,7 +365,7 @@ def create_query_url(query, skip=0):
     return f"https://huggingface.co/api/search/full-text?q={query}&limit=100&skip={skip}&type=model"
-# @cache.memoize(expire=60 * 60 * 24 * 3)  # expires after 3 days
 def get_results(query) -> Dict[Any, Any]:
     url = create_query_url(query)
     r = client.get(url)
@@ -390,8 +397,22 @@ def parse_single_result(result):
     }
 def filter_search_results(
-    results: List[Dict[Any, Any]], min_score=None, min_model_card_length=None
 ):  # TODO make code more intuitive
     results = thread_map(parse_single_result, results)
     for i, parsed_result in tqdm(enumerate(results)):
@@ -418,10 +439,14 @@ def filter_search_results(
                 yield parsed_result
-def sort_search_results(filtered_search_results):
     return sorted(
         list(filtered_search_results),
-        key=lambda x: (x["metadata_score"], x["original_position"]),
         reverse=True,
     )
@@ -435,20 +460,12 @@ def find_context(text, query, window_size):
         # Get the start and end indices of the context window
         start = max(0, index - window_size)
         end = min(len(words), index + window_size + 1)
         return " ".join(words[start:end])
     except ValueError:
         return " ".join(words[:window_size])
-# single_result[
-#     "text"
-# ] = "lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
-# results = [single_result] * 3
-def create_markdown(results):
     rows = []
     for result in results:
         row = f"""# [{result['name']}]({result['repo_hub_url']})
@@ -490,7 +507,6 @@ def _search_hub(
     # for result in filtered_results:
     #     result_text = httpx.get(result["search_result_file_url"]).text
     #     result["text"] = find_context(result_text, query, 100)
     #     final_results.append(result)
     final_results = thread_map(get_result_card_snippet, filtered_results)
     percent_of_original = round(
@@ -532,23 +548,22 @@ with gr.Blocks() as demo:
             [query, min_metadata_score, mim_model_card_length],
             [filter_results, results_markdown],
         )
-    with gr.Tab("Scoring metadata quality"):
-        with gr.Row():
-            gr.Markdown(
-                f"""
-            # Metadata quality scoring
-            ```
-            {COMMON_SCORES}
-            ```
-            For example, `TASK_TYPES_WITH_LANGUAGES` defines all the tasks for which it
-            is expected to have language metadata associated with the model.
-            ```
-            {TASK_TYPES_WITH_LANGUAGES}
-            ```
-            """
-            )
 demo.launch()

 import os
 import copy
 from dataclasses import asdict, dataclass
 from httpx_caching import CachingClient
 from httpx_caching import OneDayCacheHeuristic
+from cachetools import cached, TTLCache
+from datetime import timedelta
+from datetime import datetime
+cache = TTLCache(maxsize=500_000, ttl=timedelta(hours=24), timer=datetime.now)
 client = Client()
 client = CachingClient(client, heuristic=OneDayCacheHeuristic())
 class EngagementStats:
     likes: int
     downloads: int
+    created_at: datetime
 def _get_engagement_stats(hub_id):
 # @cache.memoize(expire=60 * 60 * 24 * 3)  # expires after 3 days
+@cached(cache)
 def _basic_check(hub_id):
     data = ModelMetadata.from_hub(hub_id)
     score = 0
     return f"https://huggingface.co/api/search/full-text?q={query}&limit=100&skip={skip}&type=model"
+@cached(cache)
 def get_results(query) -> Dict[Any, Any]:
     url = create_query_url(query)
     r = client.get(url)
     }
+def filter_for_license(results):
+    for result in results:
+        if result["is_licensed"]:
+            yield result
+def filter_for_min_model_card_length(results, min_model_card_length):
+    for result in results:
+        if result["model_card_length"] > min_model_card_length:
+            yield result
 def filter_search_results(
+    results: List[Dict[Any, Any]],
+    min_score=None,
+    min_model_card_length=None,
 ):  # TODO make code more intuitive
     results = thread_map(parse_single_result, results)
     for i, parsed_result in tqdm(enumerate(results)):
                 yield parsed_result
+def sort_search_results(
+    filtered_search_results,
+    first_sort="metadata_score",
+    second_sort="original_position",  # TODO expose these in results
+):
     return sorted(
         list(filtered_search_results),
+        key=lambda x: (x[first_sort], x[second_sort]),
         reverse=True,
     )
         # Get the start and end indices of the context window
         start = max(0, index - window_size)
         end = min(len(words), index + window_size + 1)
         return " ".join(words[start:end])
     except ValueError:
         return " ".join(words[:window_size])
+def create_markdown(results):  # TODO move to separate file
     rows = []
     for result in results:
         row = f"""# [{result['name']}]({result['repo_hub_url']})
     # for result in filtered_results:
     #     result_text = httpx.get(result["search_result_file_url"]).text
     #     result["text"] = find_context(result_text, query, 100)
     #     final_results.append(result)
     final_results = thread_map(get_result_card_snippet, filtered_results)
     percent_of_original = round(
             [query, min_metadata_score, mim_model_card_length],
             [filter_results, results_markdown],
         )
+    # with gr.Tab("Scoring metadata quality"):
+    #     with gr.Row():
+    #         gr.Markdown(
+    #             f"""
+    #         # Metadata quality scoring
+    #         ```
+    #         {COMMON_SCORES}
+    #         ```
+    #         For example, `TASK_TYPES_WITH_LANGUAGES` defines all the tasks for which it
+    #         is expected to have language metadata associated with the model.
+    #         ```
+    #         {TASK_TYPES_WITH_LANGUAGES}
+    #         ```
+    #         """
+    #         )
 demo.launch()