How to preserve additional keys when using "SELECT DISTINCT"?

111 views Asked by At

I'm looking to preserve the sid, and cid pairs that link my tables when using SELECT DISTINCT in my query. signature, ip_src, and ip_dst is what makes it distinct. I just want the output to also include the corresponding sid and cid pairs.

QUERY:

SELECT DISTINCT signature, ip_src, ip_dst FROM
     (SELECT *
          FROM event
          INNER  JOIN sensor ON (sensor.sid = event.sid)
          INNER  JOIN iphdr ON (iphdr.cid = event.cid) AND (iphdr.sid = event.sid)
          WHERE timestamp >= NOW() - '1 day'::INTERVAL
          ORDER BY timestamp DESC)
as d_dup;

OUTPUT:

 signature |   ip_src   |   ip_dst   
-----------+------------+------------
     29177 | 3244829114 | 2887777034
     29177 | 2960340989 | 2887777034
     29179 | 2887777893 | 2887777556
     29178 | 1208608738 | 2887777034
     29178 | 1211607091 | 2887777034
     29177 |  776526845 | 2887777034
     29177 | 1332731268 | 2887777034
(7 rows)

SUB QUERY:

SELECT *
          FROM event
          INNER  JOIN sensor ON (sensor.sid = event.sid)
          INNER  JOIN iphdr ON (iphdr.cid = event.cid) AND (iphdr.sid = event.sid)
          WHERE timestamp >= NOW() - '1 day'::INTERVAL
          ORDER BY timestamp DESC;

OUTPUT:

 sid |  cid  | signature |        timestamp        | sid |      hostname       | interface | filter | detail | encoding | last_cid | sid |  cid  |   ip_src   |   ip_dst   | ip_ver | ip_hlen | ip_tos | ip_len | ip_id | ip_flags | ip_off | ip_ttl | ip_proto | ip_csum 
-----+-------+-----------+-------------------------+-----+---------------------+-----------+--------+--------+----------+----------+-----+-------+------------+------------+--------+---------+--------+--------+-------+----------+--------+--------+----------+---------
   3 | 13123 |     29177 | 2014-11-15 20:53:14.656 |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13123 | 3244829114 | 2887777034 |      4 |       5 |      0 |    344 | 19301 |        0 |      0 |    122 |        6 |    8686
   3 | 13122 |     29177 | 2014-11-15 20:53:14.43  |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13122 | 3244829114 | 2887777034 |      4 |       5 |      0 |     69 | 19071 |        0 |      0 |    122 |        6 |    9191
   3 | 13121 |     29177 | 2014-11-15 18:45:13.461 |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13121 | 3244829114 | 2887777034 |      4 |       5 |      0 |    366 | 25850 |        0 |      0 |    122 |        6 |    2115
   3 | 13120 |     29177 | 2014-11-15 18:45:13.23  |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13120 | 3244829114 | 2887777034 |      4 |       5 |      0 |     69 | 25612 |        0 |      0 |    122 |        6 |    2650
   3 | 13119 |     29177 | 2014-11-15 18:45:01.887 |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13119 | 3244829114 | 2887777034 |      4 |       5 |      0 |    352 | 13697 |        0 |      0 |    122 |        6 |   14282
   3 | 13118 |     29177 | 2014-11-15 18:45:01.681 |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13118 | 3244829114 | 2887777034 |      4 |       5 |      0 |     69 | 13464 |        0 |      0 |    122 |        6 |   14798
   4 |    51 |     29179 | 2014-11-15 18:44:02.06  |   4 | VS-101-Z1:dna2:dna3 | dna2:dna3 |        |      1 |        0 |       51 |   4 |    51 | 2887777893 | 2887777556 |      4 |       5 |      0 |     80 | 18830 |        0 |      0 |     63 |       17 |   40533
   3 | 13117 |     29177 | 2014-11-15 18:41:46.418 |   3 | VS-101-Z0:dna0:dna1 | dna0:dna1 |        |      1 |        0 |    12888 |   3 | 13117 | 1332731268 | 2887777034 |      4 |       5 |      0 |    261 | 15393 |        0 |      0 |    119 |        6 |   62131
...
(30 rows)

How do I keep the sid, and cid when using SELECT DISTINCT?

3

There are 3 answers

0
AudioBubble On BEST ANSWER

Sounds like you are looking for a window function:

SELECT *
FROM (
  SELECT *,
         row_number() over (partition by signature, ip_src, ip_dst order by timestamp desc) as rn
  FROM event
     JOIN sensor ON sensor.sid = event.sid
     JOIN iphdr ON iphdr.cid = event.cid AND iphdr.sid = event.sid
  WHERE timestamp >= NOW() - interval '1' day
) as d_dup
where rn = 1
order by timestamp desc;
0
Erwin Brandstetter On

This is shorter and probably faster:

SELECT DISTINCT ON (signature, ip_src, ip_dst)
       signature, ip_src, ip_dst, sid, cid
FROM   event  e
JOIN   sensor s USING (sid)
JOIN   iphdr  i USING (cid, sid)
WHERE  timestamp >= NOW() - '1 day'::interval
ORDER  BY signature, ip_src, ip_dst, timestamp DESC;

Assuming you want the latest row (greatest timestamp) from each set of dupes.
Detailed explanation:

7
Multisync On

Maybe something like this?

SELECT DISTINCT e.sid, e.cid, ip_src, ip_dst
FROM event e
INNER  JOIN sensor s ON (s.sid = e.sid)
INNER  JOIN iphdr i ON (i.cid = e.cid) AND (i.sid = e.sid)
WHERE timestamp >= NOW() - '1 day'::INTERVAL;

If you want the combination of (signature, ip_src, ip_dst) to be unique in the result (one row for each combination) then you can try something like this:

SELECT max(e.cid), max(e.sid), signature, ip_src, ip_dst
FROM event e
INNER  JOIN sensor s ON (s.sid = e.sid)
INNER  JOIN iphdr i ON (i.cid = e.cid) AND (i.sid = e.sid)
WHERE timestamp >= NOW() - '1 day'::INTERVAL
GROUP BY signature, ip_src, ip_dst;

But it will give max cid and sid for each combination