prune_objects can prune orphaned activities that reference an array of objects

E.g. Flag activities have an array of objects

We prune the activity when NONE of the objects can be found

Note that the cost of finding and deleting these is ~4x higher than finding and deleting the non-array ones

Only string:
Delete on activities  (cost=506573.48..506580.38 rows=0 width=0)

Only Array:
Delete on activities  (cost=3570359.68..4276365.34 rows=0 width=0)

(They are still executed separately, so the total cost is the sum of the two)
This commit is contained in:
ilja 2023-01-08 18:22:53 +01:00
parent a7ec6e039c
commit 57eef6d764
2 changed files with 94 additions and 18 deletions

View file

@ -172,6 +172,7 @@ def run(["prune_objects" | args]) do
|> Repo.delete_all(timeout: :infinity)
if Keyword.get(options, :prune_orphaned_activities) do
# Prune activities who link to a single object
"""
delete from public.activities
where id in (
@ -179,28 +180,40 @@ def run(["prune_objects" | args]) do
left join public.objects o on a.data ->> 'object' = o.data ->> 'id'
left join public.activities a2 on a.data ->> 'object' = a2.data ->> 'id'
left join public.users u on a.data ->> 'object' = u.ap_id
-- Only clean up remote activities
where not a.local
-- For now we only focus on activities with direct links to objects
-- e.g. not json objects (in case of embedded objects) or json arrays (in case of multiple objects)
and jsonb_typeof(a."data" -> 'object') = 'string'
-- Find Activities that don't have existing objects
and o.id is null
and a2.id is null
and u.id is null
)
"""
|> Repo.query()
|> Repo.query([], timeout: :infinity)
# Prune activities who link to an array of objects
"""
delete from public.activities
where id in (
select a.id from public.activities a
join json_array_elements_text((a."data" -> 'object')::json) as j on jsonb_typeof(a."data" -> 'object') = 'array'
left join public.objects o on j.value = o.data ->> 'id'
left join public.activities a2 on j.value = a2.data ->> 'id'
left join public.users u on j.value = u.ap_id
group by a.id
having max(o.data ->> 'id') is null
and max(a2.data ->> 'id') is null
and max(u.ap_id) is null
)
"""
|> Repo.query([], timeout: :infinity)
end
prune_hashtags_query = """
"""
DELETE FROM hashtags AS ht
WHERE NOT EXISTS (
SELECT 1 FROM hashtags_objects hto
WHERE ht.id = hto.hashtag_id)
"""
Repo.query(prune_hashtags_query)
|> Repo.query()
if Keyword.get(options, :vacuum) do
Maintenance.vacuum("full")

View file

@ -354,7 +354,7 @@ test "with the --keep-threads option it keeps old threads with bookmarked posts"
assert length(Repo.all(Object)) == 1
end
test "We don't have unexpected tables which can contain objects that are referenced by activities" do
test "We don't have unexpected tables which may contain objects that are referenced by activities" do
# We can delete orphaned activities. For that we look for the objects they reference in the 'objects', 'activities', and 'users' table.
# If someone adds another table with objects (idk, maybe with separate relations, or collections or w/e), then we need to make sure we
# add logic for that in the 'prune_objects' task so that we don't wrongly delete their corresponding activities.
@ -481,6 +481,69 @@ test "it prunes orphaned activities with the --prune-orphaned-activities" do
assert length(activities) == 4
end
test "it prunes orphaned activities with the --prune-orphaned-activities when the objects are referenced from an array" do
  # Referents that exist before pruning: one row in `objects` and one in `users`.
  # The third kind of referent, an existing activity, is supplied by one of the
  # activities inserted below.
  # Inserts are matched assertively so a broken fixture fails here, not later as a
  # confusing count mismatch.
  {:ok, _} = Repo.insert(%Object{data: %{"id" => "existing_object"}})
  {:ok, _} = Repo.insert(%User{ap_id: "existing_actor"})

  # Remote activities whose "object" is an array of ids. An activity must be kept as
  # soon as ONE of the ids resolves, and pruned only when NONE of them do.
  fixtures = [
    # Multiple objects, one object exists (keep)
    {"remote_activity_existing_object", ["non_ existing_object", "existing_object"]},
    # Multiple objects, one actor exists (keep)
    {"remote_activity_existing_actor", ["non_ existing_object", "existing_actor"]},
    # Multiple objects, one activity exists (keep)
    {"remote_activity_existing_activity",
     ["non_ existing_object", "remote_activity_existing_actor"]},
    # Multiple objects none exist (prune)
    {"remote_activity_without_existing_referenced_object", ["owo", "whats_this"]}
  ]

  for {activity_id, object_ids} <- fixtures do
    {:ok, _} =
      Repo.insert(%Activity{
        local: false,
        data: %{"id" => activity_id, "object" => object_ids}
      })
  end

  assert length(Repo.all(Activity)) == 4

  # Without the flag no activity may be removed.
  Mix.Tasks.Pleroma.Database.run(["prune_objects"])
  assert length(Repo.all(Activity)) == 4

  Mix.Tasks.Pleroma.Database.run(["prune_objects", "--prune-orphaned-activities"])

  remaining_ids =
    Activity
    |> Repo.all()
    |> Enum.map(fn activity -> activity.data["id"] end)

  refute "remote_activity_without_existing_referenced_object" in remaining_ids

  # Pin exactly which activities survive, not only how many.
  assert Enum.sort(remaining_ids) == [
           "remote_activity_existing_activity",
           "remote_activity_existing_actor",
           "remote_activity_existing_object"
         ]
end
end
describe "running update_users_following_followers_counts" do